# Load packages
library(readr)
library(dplyr)
library(ggplot2)
library(lubridate)
library(DataExplorer)
library(tidyverse)
library(sf)
library(rnaturalearth)
library(lme4)         
library(emmeans)      
library(tidyr)
library(broom)
library(readxl)


CONV_CA_Ratio <- read_csv(
  "D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/CONV_CA_Ratio.csv",
  col_types = cols(
    IQR_weed1_complexity = col_character()
  )
)

# Add slope to CAONV_CA_Ratio

slope <- read_excel("data/raw/slope.xlsx")
slope$Farm_ID <- sub("_[A-Za-z]$", "", slope$Farm_ID)
slope <- slope %>%
  distinct(Farm_ID, .keep_all = TRUE)
colnames(slope)[colnames(slope) == "Farm_ID"] <- "IQR_block"
library(dplyr)
CONV_CA_Ratio <- CONV_CA_Ratio %>%
  left_join(slope %>% dplyr::select(IQR_block, Slope),
            by = "IQR_block")
weeds <- read_excel("data/raw/weeds.xlsx")
library(dplyr)

National AEZ (10) or OAF-Environemtns (4)?

We used OAF_AEZ (hereafter IQF_environment) for the initial evaluation and characterization of Conservation Agriculture (CA) effects. IQF_environment captures key biophysical and agronomic drivers of yield variation (e.g., moisture regime, elevation, and temperature) while maintaining a parsimonious structure with only four categories. Compared with the national AEZ classification (IQF_agzone), which includes a larger number of categories, IQF_environment provides greater statistical power to detect environmental main effects and interactions in models assessing overall CA performance. IQF_environment is also closely aligned with crop-specific agronomic realities, including maize adaptation zones (highland vs. mid-altitude) and bean growth habits (bush vs. climbing), making it well suited for characterizing general treatment responses across environments.

In subsequent driver analyses, where the objective was to better explain environmental heterogeneity in yield responses, we incorporated IQF_agzone. Although more complex, IQF_agzone explained a larger proportion of variance in yield ratio (marginal R² = 0.117 vs. 0.060; conditional R² = 0.255 vs. 0.228) and showed stronger zone-related effect sizes. Therefore, it provided finer environmental differentiation when modeling drivers of performance variability. In summary, IQF_environment was retained for the initial CA effect assessment to maximize statistical power and interpretability, while IQF_agzone was incorporated in driver analyses to capture more detailed environmental heterogeneity.

# ================================
# Compare zoning systems (fit2)
# ================================

library(lme4)
library(car)
library(effectsize)
library(performance)

# ---- Fit models ----

# IQF_environment (OAF_AEZ)
fit2_conv_env <- lmer(
  log(Y_ratio + 1) ~ crop + year + crop:year + IQF_environment +
    crop:IQF_environment +
    year:IQF_environment +
    year:crop:IQF_environment +
    (1 | IQR_block),
  data = CONV_CA_Ratio
)

# IQF_agzone
fit2_conv_agzone <- lmer(
  log(Y_ratio + 1) ~ crop + year + crop:year + IQF_agzone +
    crop:IQF_agzone +
    year:IQF_agzone +
    year:crop:IQF_agzone +
    (1 | IQR_block),
  data = CONV_CA_Ratio
)

# ---- Model comparison ----

cat("\n=== AIC / BIC ===\n")
## 
## === AIC / BIC ===
print(AIC(fit2_conv_env, fit2_conv_agzone))
##                  df       AIC
## fit2_conv_env    18 -1873.847
## fit2_conv_agzone 43 -1832.201
print(BIC(fit2_conv_env, fit2_conv_agzone))
##                  df       BIC
## fit2_conv_env    18 -1766.859
## fit2_conv_agzone 43 -1576.618
# ---- Type III ANOVA ----

cat("\n=== ANOVA: Environment ===\n")
## 
## === ANOVA: Environment ===
anova_env <- Anova(fit2_conv_env, type = 3)
print(anova_env)
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(Y_ratio + 1)
##                             Chisq Df Pr(>Chisq)    
## (Intercept)               17.9908  1  2.220e-05 ***
## crop                      21.1971  1  4.144e-06 ***
## year                       4.2321  1   0.039667 *  
## IQF_environment           26.0883  3  9.140e-06 ***
## crop:year                 21.7159  1  3.162e-06 ***
## crop:IQF_environment      16.1789  3   0.001042 ** 
## year:IQF_environment      25.0575  3  1.502e-05 ***
## crop:year:IQF_environment 15.9107  3   0.001183 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n=== ANOVA: AgZone ===\n")
## 
## === ANOVA: AgZone ===
anova_agz <- Anova(fit2_conv_agzone, type = 3)
print(anova_agz)
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(Y_ratio + 1)
##                        Chisq Df Pr(>Chisq)    
## (Intercept)          30.0357  1  4.242e-08 ***
## crop                  8.2825  1   0.004003 ** 
## year                 16.5780  1  4.669e-05 ***
## IQF_agzone           54.2814 10  4.300e-08 ***
## crop:year             8.4376  1   0.003675 ** 
## crop:IQF_agzone      11.5932  9   0.237229    
## year:IQF_agzone      53.6382  9  2.212e-08 ***
## crop:year:IQF_agzone 11.4518  9   0.246008    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ---- Partial Eta² ----

cat("\n=== Partial Eta²: Environment ===\n")
## 
## === Partial Eta²: Environment ===
eta_env <- eta_squared(fit2_conv_env)
print(eta_env)
## # Effect Size for ANOVA (Type III)
## 
## Parameter                 | Eta2 (partial) |       95% CI
## ---------------------------------------------------------
## crop                      |           0.03 | [0.02, 1.00]
## year                      |           0.01 | [0.01, 1.00]
## IQF_environment           |           0.01 | [0.01, 1.00]
## crop:year                 |           0.03 | [0.02, 1.00]
## crop:IQF_environment      |       6.59e-03 | [0.00, 1.00]
## year:IQF_environment      |           0.01 | [0.01, 1.00]
## crop:year:IQF_environment |       6.47e-03 | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
cat("\n=== Partial Eta²: AgZone ===\n")
## 
## === Partial Eta²: AgZone ===
eta_agz <- eta_squared(fit2_conv_agzone)
print(eta_agz)
## # Effect Size for ANOVA (Type III)
## 
## Parameter            | Eta2 (partial) |       95% CI
## ----------------------------------------------------
## crop                 |           0.02 | [0.01, 1.00]
## year                 |       4.49e-03 | [0.00, 1.00]
## IQF_agzone           |           0.04 | [0.02, 1.00]
## crop:year            |           0.02 | [0.01, 1.00]
## crop:IQF_agzone      |       4.98e-03 | [0.00, 1.00]
## year:IQF_agzone      |           0.04 | [0.02, 1.00]
## crop:year:IQF_agzone |       4.92e-03 | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
# ---- R² ----

cat("\n=== R²: Environment ===\n")
## 
## === R²: Environment ===
r2_env <- r2(fit2_conv_env)
print(r2_env)
## # R2 for Mixed Models
## 
##   Conditional R2: 0.228
##      Marginal R2: 0.060
cat("\n=== R²: AgZone ===\n")
## 
## === R²: AgZone ===
r2_agz <- r2(fit2_conv_agzone)
print(r2_agz)
## # R2 for Mixed Models
## 
##   Conditional R2: 0.255
##      Marginal R2: 0.117
# ---- Diagnostics (optional) ----

par(mfrow = c(1,2))
plot(fit2_conv_env)

qqnorm(resid(fit2_conv_env)); qqline(resid(fit2_conv_env))

Mix model to evaluate effect of crop, season, year and interactions on CA:CONV ratio

Description of the analysis: Linear mixed-effects model fitted to the log-transformed yield ratio (log(Y_ratio + 1)) comparing Conservation Agriculture (CA) to Conventional Agriculture (CONV). The model included:Fixed effects: crop, year, IQF_environment, and all two- and three-way interactions Random effect: IQR_block The treatment variable (Treat_code) was excluded in this model (fit2_conv) Model performance was compared to a simpler model (fit1_conv) using BIC. The summary of fixed effects and significance tests were obtained via ANOVA with Satterthwaite approximation.

Description of the results: On the analysis of the log-transformed CA:CONV yield ratio, there was a significant effect of crop, year, and environment, as well as their two- and three-way interactions.

# Model 1: no log transform, no Treat_code
fit1_conv <- lmer(
  Y_ratio ~ crop + year + IQR_Season + IQF_environment +
    IQR_Season:IQF_environment +
    (1 | IQR_block),
  data = CONV_CA_Ratio
)

# Model 2: log transform, no Treat_code
fit2_conv <- lmer(
  log(Y_ratio + 1) ~ crop + year + crop:year + IQF_environment +
    crop:IQF_environment +
    year:IQF_environment +
    year:crop:IQF_environment +
    year:crop +
    (1 | IQR_block),
  data = CONV_CA_Ratio
)

# Model comparison
BIC(fit1_conv, fit2_conv)
# View model summaries
summary(fit1_conv)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## Y_ratio ~ crop + year + IQR_Season + IQF_environment + IQR_Season:IQF_environment +  
##     (1 | IQR_block)
##    Data: CONV_CA_Ratio
## 
## REML criterion at convergence: 1872.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.5822 -0.5427 -0.0954  0.4009  9.8223 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  IQR_block (Intercept) 0.01826  0.1351  
##  Residual              0.09644  0.3105  
## Number of obs: 2818, groups:  IQR_block, 565
## 
## Fixed effects:
##                                          Estimate Std. Error t value
## (Intercept)                               2.75394    1.07417   2.564
## cropMaize                                -0.09917    0.08647  -1.147
## year                                     -0.07516    0.04296  -1.749
## IQR_Season23B                            -0.09980    0.09560  -1.044
## IQR_Season24A                            -0.24542    0.05258  -4.668
## IQR_Season24B                            -0.34972    0.06028  -5.802
## IQF_environmentMid_Low_Dry               -0.02494    0.03979  -0.627
## IQF_environmentMid_Low_Wet               -0.01160    0.04143  -0.280
## IQF_environmentTransition                 0.01562    0.04527   0.345
## IQR_Season23B:IQF_environmentMid_Low_Dry -0.24710    0.05346  -4.622
## IQR_Season24A:IQF_environmentMid_Low_Dry  0.26474    0.05466   4.843
## IQR_Season24B:IQF_environmentMid_Low_Dry  0.19055    0.05409   3.523
## IQR_Season25A:IQF_environmentMid_Low_Dry -0.06885    0.09277  -0.742
## IQR_Season25B:IQF_environmentMid_Low_Dry -0.01160    0.05507  -0.211
## IQR_Season23B:IQF_environmentMid_Low_Wet -0.06178    0.05539  -1.115
## IQR_Season24A:IQF_environmentMid_Low_Wet  0.24877    0.05706   4.360
## IQR_Season24B:IQF_environmentMid_Low_Wet  0.15245    0.05622   2.712
## IQR_Season25A:IQF_environmentMid_Low_Wet  0.06260    0.09438   0.663
## IQR_Season25B:IQF_environmentMid_Low_Wet  0.05454    0.05756   0.948
## IQR_Season23B:IQF_environmentTransition  -0.10401    0.06086  -1.709
## IQR_Season24A:IQF_environmentTransition   0.17929    0.06288   2.851
## IQR_Season24B:IQF_environmentTransition   0.15105    0.06199   2.437
## IQR_Season25A:IQF_environmentTransition   0.03154    0.09673   0.326
## IQR_Season25B:IQF_environmentTransition   0.04978    0.06287   0.792
## fit warnings:
## fixed-effect model matrix is rank deficient so dropping 2 columns / coefficients
summary(fit2_conv)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(Y_ratio + 1) ~ crop + year + crop:year + IQF_environment +  
##     crop:IQF_environment + year:IQF_environment + year:crop:IQF_environment +  
##     year:crop + (1 | IQR_block)
##    Data: CONV_CA_Ratio
## 
## REML criterion at convergence: -1909.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2810 -0.5815 -0.0355  0.4949  6.5777 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  IQR_block (Intercept) 0.005424 0.07365 
##  Residual              0.024837 0.15760 
## Number of obs: 2818, groups:  IQR_block, 565
## 
## Fixed effects:
##                                           Estimate Std. Error t value
## (Intercept)                                1.10145    0.25968   4.242
## cropMaize                                  2.18031    0.47356   4.604
## year                                      -0.02227    0.01083  -2.057
## IQF_environmentMid_Low_Dry                -1.71564    0.33938  -5.055
## IQF_environmentMid_Low_Wet                -0.79508    0.35516  -2.239
## IQF_environmentTransition                 -0.98950    0.38866  -2.546
## cropMaize:year                            -0.09316    0.01999  -4.660
## cropMaize:IQF_environmentMid_Low_Dry       0.43874    0.56254   0.780
## cropMaize:IQF_environmentMid_Low_Wet      -1.24660    0.58205  -2.142
## cropMaize:IQF_environmentTransition       -0.88133    0.61635  -1.430
## year:IQF_environmentMid_Low_Dry            0.07026    0.01415   4.965
## year:IQF_environmentMid_Low_Wet            0.03404    0.01482   2.298
## year:IQF_environmentTransition             0.04245    0.01621   2.619
## cropMaize:year:IQF_environmentMid_Low_Dry -0.01417    0.02366  -0.599
## cropMaize:year:IQF_environmentMid_Low_Wet  0.05497    0.02448   2.245
## cropMaize:year:IQF_environmentTransition   0.03960    0.02589   1.530
# Fixed effects significance
anova(fit2_conv)
# Residual diagnostics
plot(fit2_conv)

qqnorm(resid(fit2_conv)); qqline(resid(fit2_conv))

Description of the performance of CA across Environments and Districts, and from season 1 to 3

Description of the analysis: Used eemeans from the mixed-effects model to summarize the overall CA:CONV yield ratio, and to assess the effects of crop, year, and environment. Results were back-transformed for interpretation on the original yield ratio scale. Interaction effects were explored and visualized.

Description of the results: Conventional outperforms CA as a whole experimental average; overall CA:CONV ratio = 0.787 (eemeans for the same model shown above). Advantage of CONV was similar for maize and beans (t-tests of individual parameter estimates p=0.7). Although there was a significant effect of year, there was no trend for CA to perform better from year 1 to 3. Also, there was a clear effect of the environment with transition and mid-low-wet being more favourable for CA

Overal difference

overall_emm <- emmeans(fit2_conv, ~ 1, type = "response")
print(overall_emm)
##  1       response     SE  df lower.CL upper.CL
##  overall    0.779 0.0081 580    0.763    0.795
## 
## Results are averaged over the levels of: crop, IQF_environment 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log(mu + 1) scale

eemeans to interpret the effects of categorical predictors and their interactions

library(emmeans)

# Compute emmeans for crop within year × environment
emm_crop_year_env <- emmeans(fit2_conv, ~ crop | year * IQF_environment, type = "response")
emm_df <- as.data.frame(emm_crop_year_env)

Visualization of results

ggplot(emm_df, aes(x = IQF_environment, y = response, color = crop)) +
  geom_point(position = position_dodge(0.4), size = 3) +
  geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL),
                position = position_dodge(0.4), width = 0.2) +
  facet_wrap(~ year, nrow = 1) +
  labs(
    title = "Estimated Yield Ratio (CA / CONV) by Crop and Environment",
    x = "Environment",
    y = "Estimated Yield Ratio (CA:CT)",
    color = "Crop"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1) 
  )

## Does the % wins of CA increse progresibely from year 1 to 3? Separated by crop

library(dplyr)
library(ggplot2)
library(broom)
library(tidyr)

# 1. Prepare data
trend_data <- CONV_CA_Ratio %>%
  filter(
    is.finite(Y_ratio),
    year %in% c("23", "24", "25"),
    IQR_SeasAB %in% c("A", "B")
  ) %>%
  mutate(
    year = as.numeric(year),
    crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
  )

# 2. Function to compute slope counts
get_slope_label <- function(df) {
  # One slope per block
  slopes <- df %>%
    group_by(IQR_block) %>%
    filter(n_distinct(year) >= 2) %>%
    summarise(slope = coef(lm(Y_ratio ~ year))[2], .groups = "drop") %>%
    mutate(
      direction = case_when(
        slope > 0 ~ "↑ Positive",
        slope < 0 ~ "↓ Negative",
        TRUE ~ "= Flat"
      )
    ) %>%
    count(direction) %>%
    complete(direction, fill = list(n = 0))

  # Format label text
  paste0(
    "↑ Positive: ", slopes$n[slopes$direction == "↑ Positive"], "\n",
    "↓ Negative: ", slopes$n[slopes$direction == "↓ Negative"], "\n",
    "= Flat: ", slopes$n[slopes$direction == "= Flat"]
  )
}


# Labels
label_maize <- get_slope_label(trend_data %>% filter(crop_type == "Maize"))
label_beans <- get_slope_label(trend_data %>% filter(crop_type == "Beans"))

# Common y-limit helper
y_max <- max(trend_data$Y_ratio, na.rm = TRUE)


# 3. MAIZE plot

p_maize <- ggplot(
  trend_data %>% filter(crop_type == "Maize"),
  aes(x = year, y = Y_ratio)
) +
  geom_point(alpha = 0.35, size = 2) +
  
  # Block-level linear trends
  geom_smooth(
    aes(group = IQR_block),
    method = "lm",
    se = FALSE,
    color = "steelblue",
    linewidth = 0.6
  ) +
  
  # Global trend
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "red",
    linewidth = 1.3
  ) +
  
  # Annotation
  annotate(
    "label",
    x = 23.05,
    y = y_max * 0.95,
    label = label_maize,
    hjust = 0,
    vjust = 1,
    size = 4.5,
    fontface = "bold",
    fill = "white",
    label.size = 0.4
  ) +
  
  scale_x_continuous(breaks = c(23, 24, 25)) +
  labs(
    title = "Maize",
    x = "Year",
    y = "Yield Ratio (CA:CT)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 16)
  )

# 4. BEANS plot
p_beans <- ggplot(
  trend_data %>% filter(crop_type == "Beans"),
  aes(x = year, y = Y_ratio)
) +
  geom_point(alpha = 0.35, size = 2) +
  
  # Block-level linear trends
  geom_smooth(
    aes(group = IQR_block),
    method = "lm",
    se = FALSE,
    color = "darkorange",
    linewidth = 0.6
  ) +
  
  # Global trend
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "red",
    linewidth = 1.3
  ) +
  
  # Annotation
  annotate(
    "label",
    x = 23.05,
    y = y_max * 0.95,
    label = label_beans,
    hjust = 0,
    vjust = 1,
    size = 4.5,
    fontface = "bold",
    fill = "white",
    label.size = 0.4
  ) +
  
  scale_x_continuous(breaks = c(23, 24, 25)) +
  labs(
    title = "Beans",
    x = "Year",
    y = "Yield Ratio (CA:CT)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 16)
  )

# 5. Print plots
print(p_maize)

print(p_beans)

What if we target the best agroecological-zones or districts?

Description of the analysis: We first classified each block into performance typologies (Very Poor to Good) based on its CA:CONV yield ratio across seasons.For each block, we calculated its average relative yield, identified whether it ever experienced a yield penalty greater than 5%, and compiled its full sequence of seasonal performances. Based on these indicators, each farm was grouped into one of four long-term performance types: -Good: no season < 0.95 -Acceptable: Average ≥ 0.95 but at least one season < 0.95 -Poor: Average between 0.85 and 0.95 -Very Poor: Average ≤ 0.85 These thresholds were defined arbitrarily but intentionally, informed by field experience and our understanding of what yield reductions farmers may find acceptable if longer-term soil-health and economic benefits of CA are demonstrated and communicated clearly. Importantly, this typology is not final—it will be refined and validated through farmer consultations to ensure the categories and cut-off points align with farmers’ own perceptions of acceptable and unacceptable performance. We then visualized the distribution of these typologies by environment and district. Finally, we summarized and plotted the average relative yield per block, aggregated by district, to highlight areas where CA performed relatively better.

Description of the results: Even if we look at the district level, the best districts have a poor CA performance on average. The district where CA performed better (as an average from the 6 seasons) was Gisagara, and even there, the average CA:CONV ratio was 0.95 and the % of plots where CA was => to CONV was 35%. Similarly only aroun 35% of th eplots in Gisagara can be defined as with CA having acceptable yield, the rest had poor or very poor.

First lets define what is an acceptable yiel dloss

Based on the relation among farmers “preference” and relative yield of the season

Based on the focus groups discussions

## Based on the focus groups discussions

# --- Sort data by Y_ratio ---
CONV_CA_Ratio <- CONV_CA_Ratio[order(CONV_CA_Ratio$Y_ratio), ]

# --- Create groups of 15 consecutive observations ---
CONV_CA_Ratio$group10 <- rep(1:ceiling(nrow(CONV_CA_Ratio)/15), each = 15)[1:nrow(CONV_CA_Ratio)]

# --- Aggregate to get group means ---
library(dplyr)

grouped_data <- CONV_CA_Ratio %>%
  group_by(group10) %>%
  summarise(
    mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
    successes = sum(DQ_future_adoption, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  )

grouped_data$failures <- grouped_data$n - grouped_data$successes
grouped_data$mean_DQ  <- grouped_data$successes / grouped_data$n

# --- Logistic regression ---
model <- glm(
  cbind(successes, failures) ~ mean_Y_ratio,
  family = binomial,
  data = grouped_data
)

# --- Compute McFadden pseudo R² ---
null_model <- glm(
  cbind(successes, failures) ~ 1,
  family = binomial,
  data = grouped_data
)

R2_McFadden <- 1 - (as.numeric(logLik(model)) /
                    as.numeric(logLik(null_model)))

R2_McFadden
## [1] 0.2263066
# --- Compute Y_ratio where predicted probability = 0.7 ---
beta0 <- coef(model)[1]
beta1 <- coef(model)[2]

target_p <- 0.7
logit_target <- log(target_p / (1 - target_p))

Y_ratio_07 <- (logit_target - beta0) / beta1

# --- Create smooth prediction curve ---
newdata <- data.frame(
  mean_Y_ratio = seq(min(grouped_data$mean_Y_ratio),
                     max(grouped_data$mean_Y_ratio),
                     length.out = 200)
)

newdata$pred <- predict(model, newdata, type = "response")

# --- Plot ---
plot(grouped_data$mean_Y_ratio,
     grouped_data$mean_DQ,
     pch = 16,
     xlab = "Mean CA:CONV Yield Ratio (groups of 15)",
     ylab = "Proportion of future Adoption",
     ylim = c(0,1))

lines(newdata$mean_Y_ratio,
      newdata$pred,
      col = "blue",
      lwd = 2)

# --- Reference lines ---
abline(h = 0.7, col = "red", lty = 2)
abline(v = Y_ratio_07, col = "darkgreen", lwd = 2)

# --- Annotations ---
text(min(grouped_data$mean_Y_ratio), 0.76,
     labels = "Proportion = 0.7",
     col = "red",
     pos = 4)

text(Y_ratio_07, 0.2,
     labels = paste0("Y_ratio = ", round(Y_ratio_07, 3)),
     pos = 4,
     col = "darkgreen")

# --- Add R² to plot ---
text(max(grouped_data$mean_Y_ratio),
     0.95,
     labels = paste0("McFadden R² = ",
                     round(R2_McFadden, 3)),
     pos = 2)

### Smae but 2-segments model instead of sigmoidela

## Based on the focus groups discussions
library(dplyr)
library(segmented)

# --- Sort data by Y_ratio ---
CONV_CA_Ratio <- CONV_CA_Ratio[order(CONV_CA_Ratio$Y_ratio), ]

# --- Create groups of 15 ---
CONV_CA_Ratio$group15 <- rep(1:ceiling(nrow(CONV_CA_Ratio)/20), each = 20)[1:nrow(CONV_CA_Ratio)]

# --- Aggregate ---
grouped_data <- CONV_CA_Ratio %>%
  group_by(group15) %>%
  summarise(
    mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
    mean_DQ = mean(DQ_future_adoption, na.rm = TRUE),
    .groups = "drop"
  )

# --- Fit models ---
lm_model <- lm(mean_DQ ~ mean_Y_ratio, data = grouped_data)

seg_model <- segmented(lm_model,
                       seg.Z = ~ mean_Y_ratio,
                       npsi = 1)

# --- Extract breakpoint ---
breakpoint <- seg_model$psi[2]

# --- Predicted Y at breakpoint ---
Y_at_breakpoint <- predict(seg_model,
                           newdata = data.frame(mean_Y_ratio = breakpoint))

# --- Compute R² ---
R2 <- summary(seg_model)$r.squared

# --- Define NEW target dynamically ---
target <- Y_at_breakpoint - 0.15

# --- Prediction grid ---
newdata <- data.frame(
  mean_Y_ratio = seq(min(grouped_data$mean_Y_ratio),
                     max(grouped_data$mean_Y_ratio),
                     length.out = 500)
)

newdata$pred <- predict(seg_model, newdata)

# --- Compute Yield Ratio where predicted proportion = target ---
idx <- which.min(abs(newdata$pred - target))
Y_ratio_target <- newdata$mean_Y_ratio[idx]

# --- Plot ---
plot(grouped_data$mean_Y_ratio,
     grouped_data$mean_DQ,
     pch = 16,
     xlab = "Mean CA:CT Yield Ratio (groups of 15)",
     ylab = "Proportion of future Adoption",
     ylim = c(0,1))

lines(newdata$mean_Y_ratio,
      newdata$pred,
      col = "blue",
      lwd = 2)

# --- Breakpoint ---
abline(v = breakpoint,
       col = "darkgreen",
       lwd = 2,
       lty = 2)

abline(h = Y_at_breakpoint,
       col = "darkgreen",
       lty = 3)

points(breakpoint, Y_at_breakpoint,
       pch = 19, col = "darkgreen")

# --- New dynamic target ---
abline(h = target,
       col = "red",
       lty = 2)

abline(v = Y_ratio_target,
       col = "red",
       lwd = 2)

points(Y_ratio_target, target,
       pch = 19, col = "red")

# --- Annotations ---
text(breakpoint, 0.0,
     labels = paste0("Breakpoint = ",
                     round(breakpoint, 3)),
     col = "darkgreen",
     pos = 4)

text(breakpoint, Y_at_breakpoint + 0.05,
     labels = paste0("Adoption at Break = ",
                     round(Y_at_breakpoint, 3)),
     col = "darkgreen",
     pos = 4)

text(Y_ratio_target, 0.2,
     labels = paste0("Yield Ratio (Break − 0.15) = ",
                     round(Y_ratio_target, 3)),
     col = "red",
     pos = 4)

text(max(grouped_data$mean_Y_ratio),
     0.95,
     labels = paste0("R² = ",
                     round(R2, 3)),
     pos = 2)

### Same as avobe but only for farmers that perceived erosion reduction by CA AND have large slope >5

## Based on the focus groups discussions
library(dplyr)
library(segmented)

# --- Filter to DG_erosion_reduction_rate == 1 ---
CONV_subset <- CONV_CA_Ratio %>%
  filter(Slope > 4) %>%
  filter(DQ_erosion_reduction_rate == 1)

# --- Sort data by Y_ratio ---
CONV_subset <- CONV_subset[order(CONV_subset$Y_ratio), ]

# --- Create groups of 20 ---
CONV_subset$group20 <- rep(1:ceiling(nrow(CONV_subset)/20), each = 20)[1:nrow(CONV_subset)]

# --- Aggregate ---
grouped_data <- CONV_subset %>%
  group_by(group20) %>%
  summarise(
    mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
    mean_DQ = mean(DQ_future_adoption, na.rm = TRUE),
    .groups = "drop"
  )

# --- Fit models ---
lm_model <- lm(mean_DQ ~ mean_Y_ratio, data = grouped_data)

seg_model <- segmented(lm_model,
                       seg.Z = ~ mean_Y_ratio,
                       npsi = 1)

# --- Extract breakpoint ---
breakpoint <- seg_model$psi[2]

# --- Predicted Y at breakpoint ---
Y_at_breakpoint <- predict(
  seg_model,
  newdata = data.frame(mean_Y_ratio = breakpoint)
)

# --- Compute R² ---
R2 <- summary(seg_model)$r.squared

# --- Define dynamic target ---
target <- Y_at_breakpoint - 0.15

# --- Prediction grid ---
newdata <- data.frame(
  mean_Y_ratio = seq(min(grouped_data$mean_Y_ratio),
                     max(grouped_data$mean_Y_ratio),
                     length.out = 500)
)

newdata$pred <- predict(seg_model, newdata)

# --- Compute Yield Ratio where predicted proportion = target ---
idx <- which.min(abs(newdata$pred - target))
Y_ratio_target <- newdata$mean_Y_ratio[idx]

# --- Plot ---
plot(grouped_data$mean_Y_ratio,
     grouped_data$mean_DQ,
     pch = 16,
     xlab = "Mean CA:CONV Yield Ratio (DG_erosion_reduction_rate = 1)",
     ylab = "Proportion of future Adoption",
     ylim = c(0,1))

lines(newdata$mean_Y_ratio,
      newdata$pred,
      col = "blue",
      lwd = 2)

# --- Breakpoint ---
abline(v = breakpoint,
       col = "darkgreen",
       lwd = 2,
       lty = 2)

abline(h = Y_at_breakpoint,
       col = "darkgreen",
       lty = 3)

points(breakpoint, Y_at_breakpoint,
       pch = 19, col = "darkgreen")

# --- Target line ---
abline(h = target,
       col = "red",
       lty = 2)

abline(v = Y_ratio_target,
       col = "red",
       lwd = 2)

points(Y_ratio_target, target,
       pch = 19, col = "red")

# --- Annotations ---
text(breakpoint, 0.05,
     labels = paste0("Breakpoint = ",
                     round(breakpoint, 3)),
     col = "darkgreen",
     pos = 4)

text(breakpoint, Y_at_breakpoint + 0.05,
     labels = paste0("Adoption at Break = ",
                     round(Y_at_breakpoint, 3)),
     col = "darkgreen",
     pos = 4)

text(Y_ratio_target, 0.15,
     labels = paste0("Yield Ratio (Break − 0.15) = ",
                     round(Y_ratio_target, 3)),
     col = "red",
     pos = 4)

text(max(grouped_data$mean_Y_ratio),
     0.95,
     labels = paste0("R² = ",
                     round(R2, 3)),
     pos = 2)

Lets explore also how it looks when we target different slopes levels

## Based on the focus groups discussions

library(dplyr)
library(ggplot2)

# --- Create slope categories ---
CONV_CA_Ratio <- CONV_CA_Ratio %>%
  mutate(
    slope_cat = case_when(
      Slope <= 5 ~ "Low (≤5)",
      Slope > 5 & Slope <= 15 ~ "Mid (5–15)",
      Slope > 15 ~ "High (>15)"
    )
  )

# --- Sort within slope category ---
CONV_CA_Ratio <- CONV_CA_Ratio %>%
  arrange(slope_cat, Y_ratio)

# --- Create groups of 15 within each slope category ---
CONV_CA_Ratio <- CONV_CA_Ratio %>%
  group_by(slope_cat) %>%
  mutate(group15 = ceiling(row_number() / 20)) %>%
  ungroup()

# --- Aggregate ---
grouped_data <- CONV_CA_Ratio %>%
  group_by(slope_cat, group15) %>%
  summarise(
    mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
    successes = sum(DQ_future_adoption, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) %>%
  mutate(
    failures = n - successes,
    mean_DQ = successes / n
  )

# --- Fit logistic model per slope category ---
models <- grouped_data %>%
  group_by(slope_cat) %>%
  group_map(~ glm(cbind(successes, failures) ~ mean_Y_ratio,
                  family = binomial,
                  data = .x))

names(models) <- unique(grouped_data$slope_cat)

# --- Compute threshold Y_ratio where p = 0.7 for each slope ---
thresholds <- lapply(models, function(m) {
  beta0 <- coef(m)[1]
  beta1 <- coef(m)[2]
  logit_target <- log(0.7 / (1 - 0.7))
  (logit_target - beta0) / beta1
})

threshold_df <- data.frame(
  slope_cat = names(thresholds),
  Y_ratio_07 = unlist(thresholds),
  prop = 0.7
)

# --- Plot with facets ---
ggplot(grouped_data, aes(x = mean_Y_ratio, y = mean_DQ)) +
  geom_point() +
  stat_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE,
              color = "blue") +
  geom_hline(yintercept = 0.7, color = "red", linetype = "dashed") +
  geom_vline(data = threshold_df,
             aes(xintercept = Y_ratio_07),
             color = "darkgreen",
             linewidth = 1) +
  geom_point(data = threshold_df,
             aes(x = Y_ratio_07, y = prop),
             inherit.aes = FALSE,
             color = "black") +
  geom_text(data = threshold_df,
            aes(x = Y_ratio_07, y = 0.15,
                label = paste0("Y_ratio = ",
                               round(Y_ratio_07, 3))),
            color = "darkgreen",
            inherit.aes = FALSE) +
  facet_wrap(~ slope_cat) +
  labs(
    x = "Mean Y_ratio (groups of 15)",
    y = "Proportion DG_future_adoption"
  ) +
  ylim(0,1) +
  theme_minimal()

### Another way

## Based on the focus groups discussions
library(dplyr)
library(segmented)

# --- Sort data by Slope ---
CONV_CA_Ratio <- CONV_CA_Ratio[order(CONV_CA_Ratio$Slope), ]

# --- Create groups of 20 ---
CONV_CA_Ratio$group20 <- rep(1:ceiling(nrow(CONV_CA_Ratio)/20), each = 20)[1:nrow(CONV_CA_Ratio)]

# --- Aggregate ---
grouped_data <- CONV_CA_Ratio %>%
  group_by(group20) %>%
  summarise(
    mean_Slope = mean(Slope, na.rm = TRUE),
    mean_DQ = mean(DQ_future_adoption, na.rm = TRUE),
    .groups = "drop"
  )

# --- Fit models ---
lm_model <- lm(mean_DQ ~ mean_Slope, data = grouped_data)

seg_model <- segmented(lm_model,
                       seg.Z = ~ mean_Slope,
                       npsi = 1)

# --- Extract breakpoint (Slope value) ---
breakpoint <- seg_model$psi[2]

# --- Predicted Y at breakpoint ---
Y_at_breakpoint <- predict(
  seg_model,
  newdata = data.frame(mean_Slope = breakpoint)
)

# --- Compute R² ---
R2 <- summary(seg_model)$r.squared

# --- Define dynamic target ---
target <- Y_at_breakpoint - 0.15

# --- Prediction grid ---
newdata <- data.frame(
  mean_Slope = seq(min(grouped_data$mean_Slope),
                   max(grouped_data$mean_Slope),
                   length.out = 500)
)

newdata$pred <- predict(seg_model, newdata)

# --- Compute Slope where predicted proportion = target ---
idx <- which.min(abs(newdata$pred - target))
Slope_target <- newdata$mean_Slope[idx]

# --- Plot ---
plot(grouped_data$mean_Slope,
     grouped_data$mean_DQ,
     pch = 16,
     xlab = "Mean Slope (groups of 20)",
     ylab = "Proportion of future Adoption",
     ylim = c(0,1))

lines(newdata$mean_Slope,
      newdata$pred,
      col = "blue",
      lwd = 2)

# --- Breakpoint ---
abline(v = breakpoint,
       col = "darkgreen",
       lwd = 2,
       lty = 2)

abline(h = Y_at_breakpoint,
       col = "darkgreen",
       lty = 3)

points(breakpoint, Y_at_breakpoint,
       pch = 19, col = "darkgreen")

# --- Target line ---
abline(h = target,
       col = "red",
       lty = 2)

abline(v = Slope_target,
       col = "red",
       lwd = 2)

points(Slope_target, target,
       pch = 19, col = "red")

# --- Annotations ---
text(breakpoint, 0.05,
     labels = paste0("Breakpoint (Slope) = ",
                     round(breakpoint, 3)),
     col = "darkgreen",
     pos = 4)

text(breakpoint, Y_at_breakpoint + 0.05,
     labels = paste0("Adoption at Break = ",
                     round(Y_at_breakpoint, 3)),
     col = "darkgreen",
     pos = 4)

text(Slope_target, 0.15,
     labels = paste0("Slope (Break − 0.15) = ",
                     round(Slope_target, 3)),
     col = "red",
     pos = 4)

text(max(grouped_data$mean_Slope),
     0.95,
     labels = paste0("R² = ",
                     round(R2, 3)),
     pos = 2)

# --- Sort data by Crop and Y_ratio ---
CONV_CA_Ratio <- CONV_CA_Ratio[order(CONV_CA_Ratio$crop,
                                     CONV_CA_Ratio$Y_ratio), ]

library(dplyr)

# --- Create groups of 10 consecutive observations WITHIN each Crop ---
CONV_CA_Ratio <- CONV_CA_Ratio %>%
  group_by(crop) %>%
  mutate(group10 = ceiling(row_number() / 10)) %>%
  ungroup()

# --- Aggregate to get group means per Crop ---
grouped_data <- CONV_CA_Ratio %>%
  group_by(crop, group10) %>%
  summarise(
    mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
    successes = sum(DQ_future_adoption, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  )

grouped_data$failures <- grouped_data$n - grouped_data$successes
grouped_data$mean_DQ  <- grouped_data$successes / grouped_data$n

# --- Logistic regression including Crop ---
model <- glm(
  cbind(successes, failures) ~ mean_Y_ratio + crop,
  family = binomial,
  data = grouped_data
)

summary(model)
## 
## Call:
## glm(formula = cbind(successes, failures) ~ mean_Y_ratio + crop, 
##     family = binomial, data = grouped_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.04297    0.12410  -8.404   <2e-16 ***
## mean_Y_ratio  2.06496    0.15242  13.548   <2e-16 ***
## cropMaize    -0.13291    0.08141  -1.633    0.103    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 658.55  on 282  degrees of freedom
## Residual deviance: 429.21  on 280  degrees of freedom
## AIC: 1114.3
## 
## Number of Fisher Scoring iterations: 5
# --- Create prediction grid for each Crop ---
newdata <- expand.grid(
  mean_Y_ratio = seq(min(grouped_data$mean_Y_ratio),
                     max(grouped_data$mean_Y_ratio),
                     length.out = 200),
  crop = unique(grouped_data$crop)
)

newdata$pred <- predict(model, newdata, type = "response")

# --- Plot ---
cols <- rainbow(length(unique(grouped_data$crop)))
crop_levels <- unique(grouped_data$crop)

plot(NULL,
     xlim = range(grouped_data$mean_Y_ratio),
     ylim = c(0, 1),
     xlab = "Mean Y_ratio (groups of 10)",
     ylab = "Proportion DQ_future_adoption")

for(i in seq_along(crop_levels)) {
  
  crop_i <- crop_levels[i]
  
  # Points
  points(
    grouped_data$mean_Y_ratio[grouped_data$crop == crop_i],
    grouped_data$mean_DQ[grouped_data$crop == crop_i],
    col = cols[i],
    pch = 16
  )
  
  # Logistic curve
  lines(
    newdata$mean_Y_ratio[newdata$crop == crop_i],
    newdata$pred[newdata$crop == crop_i],
    col = cols[i],
    lwd = 2
  )
}

legend("topleft",
       legend = crop_levels,
       col = cols,
       pch = 16,
       lwd = 2)

What about the effect of CA on erosion

library(dplyr)
library(tidyr)

# --- Create slope categories and remove NA ---
erosion_table <- CONV_CA_Ratio %>%
  filter(
    !is.na(DQ_erosion_reduction_rate),
    !is.na(Slope),
    !is.na(IQR_Season)
  ) %>%
  mutate(
    slope_cat = case_when(
      Slope <= 5 ~ "Low",
      Slope > 5 & Slope <= 15 ~ "Mid",
      Slope > 15 ~ "High"
    )
  ) %>%
  group_by(IQR_Season, slope_cat) %>%
  summarise(
    N = n(),
    perc = mean(DQ_erosion_reduction_rate) * 100,
    .groups = "drop"
  ) %>%
  mutate(
    value = paste0(round(perc, 1), "% (", N, ")")
  ) %>%
  dplyr::select(IQR_Season, slope_cat, value) %>%
  pivot_wider(
    names_from = slope_cat,
    values_from = value
  ) %>%
  arrange(IQR_Season)

erosion_table

Yield performance as a qualitative evaluation of multiple seasons

library(dplyr)
library(tidyr)

# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")

# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>%
  group_by(IQR_block) %>%
  summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
  filter(n_valid_seasons >= 4)

# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- CONV_CA_Ratio %>%
  filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
         IQR_Season %in% seasons,
         is.finite(Y_ratio))

# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
  group_by(IQR_block, IQR_Season) %>%
  summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = IQR_Season,
    values_from = Y_ratio,
    names_prefix = "Y_ratio_"
  )

# Step 5: Compute performance typology
Perf_Type <- filtered_yield %>%
  group_by(IQR_block) %>%
  summarise(
    n_seasons = n_distinct(IQR_Season),
    any_below_0.95 = any(Y_ratio < 0.95),
    mean_ratio = mean(Y_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    CA_typology = case_when(
      !any_below_0.95 ~ "Good",
      any_below_0.95 & mean_ratio >= 0.95 ~ "Acceptable",
      any_below_0.95 & mean_ratio < 0.95 & mean_ratio > 0.85 ~ "Poor",
      mean_ratio <= 0.85 ~ "Very Poor"
    )
  )

# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
  arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>%  # Prioritize 23A > 24A > 25A
  distinct(IQR_block, .keep_all = TRUE) %>%
  dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)


# Step 7: Join all into Perf_Type
Perf_Type <- Perf_Type %>%
  left_join(RY_wide, by = "IQR_block") %>%
  left_join(block_meta, by = "IQR_block")

lets add a new variable of CA performacne classification but with a threshold of 0.9

library(dplyr)
library(tidyr)

# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")

# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>%
  group_by(IQR_block) %>%
  summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
  filter(n_valid_seasons >= 4)

# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- CONV_CA_Ratio %>%
  filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
         IQR_Season %in% seasons,
         is.finite(Y_ratio))

# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
  group_by(IQR_block, IQR_Season) %>%
  summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = IQR_Season,
    values_from = Y_ratio,
    names_prefix = "Y_ratio_"
  )

# Step 5: Compute performance typology
Perf_Type_09 <- filtered_yield %>%
  group_by(IQR_block) %>%
  summarise(
    n_seasons = n_distinct(IQR_Season),
    any_below_0.9 = any(Y_ratio < 0.9),
    mean_ratio = mean(Y_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    CA_typology_09 = case_when(
      !any_below_0.9 ~ "Good",
      any_below_0.9 & mean_ratio >= 0.9 ~ "Acceptable",
      any_below_0.9 & mean_ratio < 0.9 & mean_ratio > 0.8 ~ "Poor",
      mean_ratio <= 0.8 ~ "Very Poor"
    )
  )

# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
  arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>%  # Prioritize 23A > 24A > 25A
  distinct(IQR_block, .keep_all = TRUE) %>%
  dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)


# Step 7: Join all into Perf_Type
Perf_Type_09 <- Perf_Type_09 %>%
  left_join(RY_wide, by = "IQR_block") %>%
  left_join(block_meta, by = "IQR_block")

Yield performance by environment (OAF Agroecological zooning)

# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
                                levels = c("Very Poor", "Poor", "Acceptable", "Good"))

ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
  geom_bar(
    aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
    position = "stack"
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "CA-Performance Typology Distribution by OAF-AEZ",
    x = "Performance Typology",
    y = "Percentage of Performance Typologies"
  ) +
  scale_fill_manual(values = c(
    "Very Poor" = "#e41a1c",
    "Poor" = "#ff7f00",
    "Acceptable" = "#377eb8",
    "Good" = "#4daf4a"
  )) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  )

Yield performance by districy

# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
                                levels = c("Very Poor", "Poor", "Acceptable", "Good"))

ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
  geom_bar(
    aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
    position = "stack"
  ) +
  facet_wrap(~ IQF_District) +
  labs(
    title = "CA-Performance Typology Distribution by District",
    x = "Performance Typology",
    y = "Percentage of Performance Typologies"
  ) +
  scale_fill_manual(values = c(
    "Very Poor" = "#e41a1c",
    "Poor" = "#ff7f00",
    "Acceptable" = "#377eb8",
    "Good" = "#4daf4a"
  )) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  )

Average yiel performance across seasons per district

library(dplyr)
library(ggplot2)
library(glue)

# 1. Mean Y_ratio per block (across seasons)
block_means <- CONV_CA_Ratio %>%
  filter(is.finite(Y_ratio)) %>%
  group_by(IQR_block, IQF_District) %>%
  summarise(mean_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")

# 2. Summary for annotations
district_summary <- block_means %>%
  group_by(IQF_District) %>%
  summarise(
    avg_ratio = round(mean(mean_Y_ratio, na.rm = TRUE), 2),
    n_total = n(),
    n_ge1 = sum(mean_Y_ratio >= 1, na.rm = TRUE),
    percent_ge1 = round(100 * n_ge1 / n_total)
  ) %>%
  mutate(
    label = glue("≥1: {percent_ge1}%\nµ = {avg_ratio}"),
    y_pos = 1.5
  )

# 3. Plot
ggplot(block_means, aes(x = IQF_District, y = mean_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "gray90") +
  geom_jitter(width = 0.2, alpha = 0.5, color = "steelblue", size = 2) +
  geom_text(
    data = district_summary,
    aes(x = IQF_District, y = y_pos, label = label),
    inherit.aes = FALSE,
    size = 2,
    vjust = 0
  ) +
  labs(
    title = "Distribution of Mean Y_ratio per Block by District",
    x = "District",
    y = "Mean Y_ratio per Block (across seasons)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

CA performance on individual plots rather than averages

Description of the analysis: We analyzed the block-level average performance of Conservation Agriculture (CA) relative to Conventional Agriculture (CONV) by calculating the mean CA:CONV yield ratio per block across all seasons (i.e., 3 years = 6 seasons, 3 maize + 3 beans). A cumulative distribution plot was used to visualize how many blocks reached or exceeded yield parity (Y_ratio ≥ 1). Additionally, density plots by agro-environment visualized how CA performance varied across zones.

Description of the results: As we showed previously, when examining the average performance of CA at the country, environment, or district level, results are discouraging, suggesting that a blanket recommendation of this package is unlikely to generate broad impact or adoption, even if we target the best environmetn or district. However, when zooming in to the plot level, there are a substantial number of cases where CA either outperforms CONV or has a similar performance. Specifically, CA outperforms CONV in 18% of blocks, and in 21% of cases achieves a yield ratio close to or above acceptable thresholds (Shown before in the figure of Performance Typologies)

library(dplyr)
library(ggplot2)

# 1. Summarize data to block level
block_data <- CONV_CA_Ratio %>%
  filter(is.finite(Y_ratio)) %>%
  group_by(IQR_block) %>%
  summarise(
    avg_Y_ratio = mean(Y_ratio, na.rm = TRUE),
    IQF_environment = first(IQF_environment),  # to retain for optional grouping
    .groups = "drop"
  ) %>%
  arrange(avg_Y_ratio) %>%
  mutate(cum_percent = 100 * (row_number() / n()))

# 2. Plot cumulative distribution
ggplot(block_data, aes(x = avg_Y_ratio, y = cum_percent)) +
  geom_point(size = 2, alpha = 0.7, color = "darkorange") +
  geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
  scale_x_continuous(limits = c(0, 2.5)) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  labs(
    x = "Block-Level Average Yield Ratio (CA / CONV)",
    y = "Cumulative % of Blocks",
    title = "Cumulative Distribution of Relative Yield (CA vs CONV)",
    subtitle = "Each point is a block's average over all seasons; Dashed line = yield parity",
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 10, hjust = 0.5)
  )

##Visual distribution of CONV advantage vs CA_Yield

library(ggplot2)

ggplot(CONV_CA_Ratio, aes(x = Y_ratio, fill = IQF_environment)) +
  geom_density(alpha = 0.3) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  labs(title = "Density of Relative Yield (CA / CONV) by Agzone")

Drivers of yield performance

We start with the database of yield drivers that was already processed in the scrips: “data_mm_block” but we change the relative

data_mm_block <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/data_mm_block.csv")

drivers_data <- data_mm_block %>%
  mutate(
    CA_typology = fct_drop(CA_typology),

    IQR_block = factor(IQR_block),
    IQR_TF_tubura_client  = factor(IQR_TF_tubura_client),
    IQR_plot_fert_quality = factor(IQR_plot_fert_quality),
    IQR_soil_texture      = fct_na_value_to_level(factor(IQR_soil_texture), "Unknown"),
    IQR_soil_color        = fct_na_value_to_level(factor(IQR_soil_color), "Unknown"),
    IQF_position_slope    = fct_na_value_to_level(factor(IQF_position_slope), "Unknown"),
    Weed_species_A_combined = fct_na_value_to_level(factor(Weed_species_A_combined), "Unknown")
  )

# We add the variable slope to drivers_data
drivers_data <- drivers_data %>%
  left_join(
    slope %>% dplyr::select(IQR_block, Slope),
    by = "IQR_block"
  )


str(drivers_data)
## tibble [473 × 19] (S3: tbl_df/tbl/data.frame)
##  $ IQR_block              : chr [1:473] "102_10" "102_11" "102_12" "102_13" ...
##  $ ICR_N_perc_23A         : num [1:473] 0.21 0.17 0.17 0.08 0.11 0.19 0.22 0.31 0.39 0.37 ...
##  $ ICR_Org_C_avg          : num [1:473] 2.01 1.75 2.16 1.68 1.99 ...
##  $ ICR_K_perc_23A         : num [1:473] 0.75 0.32 0.57 0.41 0.42 0.55 1.75 1.39 2.09 1.25 ...
##  $ ICR_Avail_P_avg        : num [1:473] 11.7 11.5 22 60.6 35.6 ...
##  $ ICR_arable_land_avg    : num [1:473] 40.4 56.4 45.8 54 28.2 54.6 51.8 47.8 60.2 38.2 ...
##  $ ICF_Alt_avg            : num [1:473] 1636 1293 1141 1207 1145 ...
##  $ ICR_rainfall_avg       : num [1:473] 3.96 3.97 3.97 3.97 3.97 ...
##  $ Comp_amount_corr_quali : num [1:473] 5.17 4.39 4.42 4.73 4.9 ...
##  $ ICF_planting_date      : num [1:473] 8.2 9.4 6.75 12.4 5.2 2.8 6.4 4.4 5.2 6.8 ...
##  $ IQR_TF_tubura_client   : Factor w/ 3 levels "---","0","1": 3 3 2 3 2 3 3 3 3 3 ...
##  $ IQR_plot_fert_quality  : Factor w/ 4 levels "---","high","low_quality",..: 4 2 4 2 4 4 4 2 4 2 ...
##  $ IQR_soil_texture       : Factor w/ 4 levels "Coarse","Fine",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ IQR_soil_color         : Factor w/ 6 levels "#N/A","black",..: 2 2 2 5 2 5 2 2 5 2 ...
##  $ IQF_position_slope     : Factor w/ 7 levels "---","Backslope",..: 4 2 2 2 4 2 6 2 2 2 ...
##  $ Weed_species_A_combined: Factor w/ 5 levels "—","Broadleaf",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ CA_typology            : Factor w/ 3 levels "Acceptable","Poor",..: 3 1 3 2 1 3 1 1 2 2 ...
##  $ IQF_environment        : chr [1:473] "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" ...
##  $ Slope                  : num [1:473] 15 15 15 20 12 14 15 8 10 8 ...

We paste the averaged relative yield along the 6 seasons to each bloc and we remove the of CA_typology

# 1. Remove 'CA_typology' column from drivers_data
drivers_data <- drivers_data |> dplyr::select(-CA_typology)

# 2. Calculate average Y_ratio by IQR_block from CONV_CA_Ratio
# First, ensure grouping by both IQR_block and IQR_season, then average per IQR_block
avg_Y_ratio <- CONV_CA_Ratio |>
  dplyr::group_by(IQR_block, IQR_Season) |>
  dplyr::summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") |>
  dplyr::group_by(IQR_block) |>
  dplyr::summarise(avg_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")

# 3. Merge avg_Y_ratio into drivers_data by IQR_block
drivers_data <- drivers_data |> 
  dplyr::left_join(avg_Y_ratio, by = "IQR_block")

We create a new variable “Y_ratio_2425”

This is the relative yield for maize and beans averaged per block, but exluding 2023 as the cover crop (mucuna) was too competitive and was later changed to hemp

library(dplyr)

# 2. Compute average Y_ratio for year 24 and 25, grouped by IQR_block
avg_Y_ratio_2425 <- CONV_CA_Ratio %>%
  filter(year %in% c(24, 25)) %>%                         # Keep only year 24 and 25
  group_by(IQR_block, IQR_Season) %>%                    # Group by block and season
  summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  group_by(IQR_block) %>%
  summarise(Y_ratio_2425 = mean(Y_ratio, na.rm = TRUE), .groups = "drop")  # Final average per block

# 3. Merge this into drivers_data by IQR_block
drivers_data <- drivers_data %>%
  left_join(avg_Y_ratio_2425, by = "IQR_block")

We create a new variable “Y_ratio_maize”

This is the relative yield for maize only, averaged for 3 years

library(dplyr)


# 2. Compute average Y_ratio for year == 25, grouped by IQR_block
Y_ratio_maize <- CONV_CA_Ratio %>%
  filter(crop == "Maize") %>%                          # Keep only year 25
  group_by(IQR_block, IQR_Season) %>%            # Group by block and season
  summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  group_by(IQR_block) %>%
  summarise(Y_ratio_maize = mean(Y_ratio, na.rm = TRUE), .groups = "drop")  # Final average per block

# 3. Merge this into drivers_data by IQR_block
drivers_data <- drivers_data %>%
  left_join(Y_ratio_maize, by = "IQR_block")

We also add the performance variable CA_Typology” and CA_Typology_09

library(dplyr)

# Merge CA_typology from Perf_Type using IQR_block as the key
drivers_data <- drivers_data |>
  left_join(Perf_Type |>
              dplyr::select(IQR_block, CA_typology),
            by = "IQR_block")

drivers_data <- drivers_data |>
  left_join(Perf_Type_09 |>
              dplyr::select(IQR_block, CA_typology_09),
            by = "IQR_block")

#Explore, visually the protential drivers and the relatio nwith Y_ration of the blocks

Cathegoric variables

Variables we have so far (after filtering some out and re-grouping cathegories with low N: ICR_Org_C_avg num: ICR_K_perc_23A num: ICR_Avail_P_avg num: ICR_arable_land_avg num: ICF_Alt_avg num: ICR_rainfall_avg num: Comp_amount_corr_quali num: ICF_planting_date num: IQR_TF_tubura_client Factor: IQR_plot_fert_quality Factor: IQR_soil_texture Factor w/ 4 levels: IQR_soil_color Factor w/ 6 levels: IQF_position_slope Factor w/ 7 levels: Weed_species_A_combined: Factor w/ 5 levels: IQF_environment chr “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” Output variables: avg_Y_ratio num: CA_typology factor:

CA Performance as affected by ICR_Org_C_avg

with avg_Y_ratio

Notes: when we look at all the database pooled together, there is no relation at all among SOC and performance of the CA. Howeever, where we split the analysis be Environment, then an interation becomes clear, where there is no relation in mid altitude and the relation is quite significant and positive in Highland and sligtly negative in transition. Conclusion. SOC should remain for the drivers analysis

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_Org_C_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_Org_C_avg vs. avg_Y_ratio",
    x = "Organic Carbon (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()

### What if we facet by environment? to see if interactions are masking the effect

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions (adjust if needed)
#----------------------------------------------
# Choose fixed position for annotation
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, lm line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_Org_C_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Organic Carbon (%)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

with CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Fit ANOVA model
model_anova <- aov(ICR_Org_C_avg ~ CA_typology, data = drivers_data)

# 2. Tukey HSD post-hoc test
tukey <- TukeyHSD(model_anova)

# 3. Extract group letters
letters_df <- multcompLetters(tukey$CA_typology[,"p adj"])$Letters
letters_df <- data.frame(CA_typology = names(letters_df), Letters = letters_df)

# 4. Calculate group means
means_df <- drivers_data %>%
  group_by(CA_typology) %>%
  summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 5. Combine letters and means
plot_labels <- left_join(letters_df, means_df, by = "CA_typology")

# 6. Create boxplot with letters and means at fixed Y positions
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray", color = "black") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_text(data = plot_labels, aes(x = CA_typology, y = 7.5, label = Letters),
            fontface = "bold", size = 5, color = "darkred") +
  geom_text(data = plot_labels, aes(x = CA_typology, y = 7, label = mean_label),
            fontface = "plain", size = 4, color = "black") +
  labs(
    title = "ICR_Org_C_avg by CA_typology",
    subtitle = "Letters indicate significant differences (Tukey HSD)",
    x = "CA Typology",
    y = "Organic Carbon (%)"
  ) +
  theme_minimal()

### What if we facet by Environment? myve some interactions masking the effect?

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------

get_letters <- function(df) {

  mod <- aov(ICR_Org_C_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)

  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------

stats_df <- drivers_data %>%
  filter(!is.na(CA_typology)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------

ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +

  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

# Letters
geom_text(
  data = stats_df,
  aes(x = CA_typology, y = 9.5, label = Letters),
  inherit.aes = FALSE,
  fontface = "bold",
  size = 3,
  color = "darkred"
) +

# Means
geom_text(
  data = stats_df,
  aes(x = CA_typology, y = 8.25, label = mean_label),
  inherit.aes = FALSE,
  size = 3
) +


  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_Org_C_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Organic Carbon (%)"
  ) +

  theme_minimal()

## CA Performance as affected by ICR_K_perc_23A ### with avg_Y_ratio Notes: there is some minimal relation in highland. When we look at the correlation among SOC and K, its relatively high in highland. So we could just use SOC in the drivers analysis and exclude K Conclusion: Keep SCO but explcude K

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_K_perc_23A), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_K_perc_23A, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_K_perc_23A, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_K_perc_23A vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Potassium (%) in 23A",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

### with CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
  mod <- aov(ICR_K_perc_23A ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_K_perc_23A)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_K_perc_23A, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_K_perc_23A)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  # Letters
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 3.2, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  # Means
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 2.8, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_K_perc_23A by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Potassium (%) in 23A"
  ) +

  theme_minimal()

## Since SOC semms to have some explanatory power and K very little, lets see the correlation among them Notes: there is quite a correlation among K and SOC in highland

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_K_perc_23A), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_K_perc_23A ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3), 
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_K_perc_23A)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Potassium (23A)",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_K_perc_23A (Potassium %)"
  ) +
  theme_minimal()

### What if instead of contineous, we define deffitient or non defficiennt

library(dplyr)
library(ggplot2)
library(segmented)

# ===============================
# 1) Average env_index per block across seasons
# ===============================

env_block_avg <- CONV_CA_Ratio %>%
  filter(!is.na(IQR_block), !is.na(env_index)) %>%
  group_by(IQR_block) %>%
  summarise(
    env_index_avg = mean(env_index, na.rm = TRUE),
    .groups = "drop"
  )

# ===============================
# 2) Ensure drivers_data is block-level for K
# ===============================

k_block <- drivers_data %>%
  filter(!is.na(IQR_block), !is.na(ICR_K_perc_23A)) %>%
  group_by(IQR_block) %>%
  summarise(
    ICR_K_perc_23A = mean(ICR_K_perc_23A, na.rm = TRUE),
    .groups = "drop"
  )

# ===============================
# 3) Join datasets
# ===============================

plot_data <- env_block_avg %>%
  left_join(k_block, by = "IQR_block") %>%
  filter(!is.na(ICR_K_perc_23A))

cat("Blocks used:", nrow(plot_data), "\n")
## Blocks used: 459
# ===============================
# 4) Fit linear model
# ===============================

lm_model <- lm(env_index_avg ~ ICR_K_perc_23A,
               data = plot_data)

# ===============================
# 5) Fit segmented (breakpoint) model
# ===============================

seg_model <- segmented(
  lm_model,
  seg.Z = ~ ICR_K_perc_23A,
  psi = list(ICR_K_perc_23A = median(plot_data$ICR_K_perc_23A, na.rm = TRUE))
)

summary(seg_model)
## 
##  ***Regression Model with Segmented Relationship(s)***
## 
## Call: 
## segmented.lm(obj = lm_model, seg.Z = ~ICR_K_perc_23A, psi = list(ICR_K_perc_23A = median(plot_data$ICR_K_perc_23A, 
##     na.rm = TRUE)))
## 
## Estimated Break-Point(s):
##                      Est. St.Err
## psi1.ICR_K_perc_23A  0.3  0.045
## 
## Coefficients of the linear terms:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.71785    0.06459  11.115  < 2e-16 ***
## ICR_K_perc_23A     1.06639    0.31448   3.391 0.000757 ***
## U1.ICR_K_perc_23A -1.00761    0.31623  -3.186       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2824 on 455 degrees of freedom
## Multiple R-Squared: 0.08489,  Adjusted R-squared: 0.07885 
## 
## Boot restarting based on 6 samples. Last fit:
## Convergence attained in 1 iterations (rel. change 1.5678e-07)
# Extract breakpoint
breakpoint <- seg_model$psi[2]
cat("Estimated Breakpoint:", round(breakpoint, 3), "\n")
## Estimated Breakpoint: 0.3
# Add predictions
plot_data$predicted <- predict(seg_model)

# ===============================
# 6) Plot
# ===============================

ggplot(plot_data,
       aes(x = ICR_K_perc_23A,
           y = env_index_avg)) +
  
  geom_point(alpha = 0.7,
             color = "steelblue",
             size = 2) +
  
  geom_line(aes(y = predicted),
            color = "red",
            size = 1.2) +
  
  geom_vline(xintercept = breakpoint,
             linetype = "dashed",
             color = "darkgreen",
             size = 1) +
  
  labs(
    title = "Breakpoint Model: Environmental Index vs Potassium (%)",
    subtitle = paste0("Estimated breakpoint ≈ ",
                      round(breakpoint, 3)),
    x = "ICR_K_perc_23A",
    y = "Mean env_index across seasons"
  ) +
  
  theme_minimal(base_size = 14)

Based on litterature: VLow: <0.2 meq/100g Low: >=0.2 and < 0.5 meq/100g. Opt: =>0.5meq/100g. https://link.springer.com/article/10.1186/s40066-018-0165-5#:~:text=Exchangeable%20K,-The%20soil%20exchangeable&text=The%20range%20of%20values%20recorded,their%20soil%20exchangeable%20K%2C%20respectively.

Then adjusted to our findings: VLow: <0.3 meq/100g Low: >=0.23 and < 0.56 meq/100g. Opt: =>0.6meq/100g.

library(dplyr)

drivers_data <- drivers_data %>%
  mutate(
    K_factor = case_when(
      ICR_K_perc_23A < 0.3 ~ "VLow",
      ICR_K_perc_23A >= 0.3 & ICR_K_perc_23A < 0.6 ~ "Low",
      ICR_K_perc_23A >= 0.6 ~ "Opt",
      TRUE ~ NA_character_
    ),
    K_factor = factor(K_factor,
                      levels = c("VLow", "Low", "Opt"),
                      ordered = TRUE)
  )

FIgure of K_Factor with Y_ratio

library(ggplot2)
library(dplyr)

# 1️⃣ Filter valid data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(K_factor)
  ) %>%
  droplevels()

# 2️⃣ Run ANOVA
anova_model <- aov(avg_Y_ratio ~ K_factor, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3️⃣ Run Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ K_factor, data = df_filt)$p.value

# 4️⃣ Build annotation label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 5️⃣ Calculate group means
mean_df <- df_filt %>%
  group_by(K_factor) %>%
  summarise(
    mean_val = mean(avg_Y_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6️⃣ Plot
ggplot(df_filt, aes(x = K_factor, y = avg_Y_ratio)) +
  
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
  
  # Means under each box
  geom_text(
    data = mean_df,
    aes(
      x = K_factor,
      y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
      label = mean_label
    ),
    inherit.aes = FALSE,
    size = 4
  ) +
  
  # P-values annotation
  annotate(
    "text",
    x = Inf, y = Inf,
    hjust = 1.1, vjust = 1.5,
    label = test_label,
    size = 5
  ) +
  
  labs(
    title = "avg_Y_ratio by K_factor",
    subtitle = "ANOVA and Kruskal–Wallis test results shown",
    x = "Potassium Level (K_factor)",
    y = "Average Yield Ratio"
  ) +
  
  theme_minimal()

CA Performance as affected by ICR_Avail_P_avg

with avg_Y_ratio

NOtes: the values sees high in general and not relation with the yield of the plots (environmental index calculated as the ) Conclusions: poor

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_Avail_P_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_Avail_P_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Avail_P_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_Avail_P_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Available Phosphorus (ppm)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###ICR_Avail_P_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
  mod <- aov(ICR_Avail_P_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_Avail_P_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_Avail_P_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Avail_P_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  # Letters
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 110, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  # Means
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 100, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_Avail_P_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Available Phosphorus (ppm)"
  ) +
  theme_minimal()

###Correlation: ICR_Org_C_avg vs. ICR_Avail_P_avg

First we check the values. I suspect they are toon high

library(dplyr)
library(ggplot2)

drivers_data_clean <- drivers_data %>%
  filter(!is.na(ICR_Avail_P_avg))

ggplot(drivers_data_clean,
       aes(x = ICR_Avail_P_avg)) +
  stat_ecdf(geom = "step", size = 1.2, color = "blue") +
  scale_x_continuous(
    breaks = seq(
      floor(min(drivers_data_clean$ICR_Avail_P_avg, na.rm = TRUE)),
      ceiling(max(drivers_data_clean$ICR_Avail_P_avg, na.rm = TRUE)),
      by = 15
    )
  ) +
  labs(
    title = "Cumulative Distribution of Available P",
    x = "ICR_Avail_P_avg",
    y = "Cumulative Proportion"
  ) +
  theme_minimal()

they still seem high, lets see the relation with the evironmetal index calculated before. This is an index of the yield of all the seasons in the block as compared to plots in the same seasson-district

library(dplyr)
library(ggplot2)
library(segmented)

# ===============================
# 1) Average env_index per block across seasons
# ===============================

env_block_avg <- CONV_CA_Ratio %>%
  filter(!is.na(IQR_block), !is.na(env_index)) %>%
  group_by(IQR_block) %>%
  summarise(
    env_index_avg = mean(env_index, na.rm = TRUE),
    .groups = "drop"
  )

# ===============================
# 2) Ensure drivers_data is block-level for P
# ===============================

p_block <- drivers_data %>%
  filter(!is.na(IQR_block), !is.na(ICR_Avail_P_avg)) %>%
  group_by(IQR_block) %>%
  summarise(
    ICR_Avail_P_avg = mean(ICR_Avail_P_avg, na.rm = TRUE),
    .groups = "drop"
  )

# ===============================
# 3) Join and restrict to P ≤ 70
# ===============================

plot_data <- env_block_avg %>%
  left_join(p_block, by = "IQR_block") %>%
  filter(!is.na(ICR_Avail_P_avg),
         ICR_Avail_P_avg <= 70)

cat("Blocks used:", nrow(plot_data), "\n")
## Blocks used: 442
# ===============================
# 4) Fit linear model
# ===============================

lm_model <- lm(env_index_avg ~ ICR_Avail_P_avg,
               data = plot_data)

# ===============================
# 5) Fit segmented (breakpoint) model
# ===============================

seg_model <- segmented(
  lm_model,
  seg.Z = ~ ICR_Avail_P_avg,
  psi = list(ICR_Avail_P_avg = 30)  # initial guess
)

summary(seg_model)
## 
##  ***Regression Model with Segmented Relationship(s)***
## 
## Call: 
## segmented.lm(obj = lm_model, seg.Z = ~ICR_Avail_P_avg, psi = list(ICR_Avail_P_avg = 30))
## 
## Estimated Break-Point(s):
##                        Est. St.Err
## psi1.ICR_Avail_P_avg 4.829  0.999
## 
## Coefficients of the linear terms:
##                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)         0.48109    0.32650   1.473    0.141
## ICR_Avail_P_avg     0.10610    0.08338   1.272    0.204
## U1.ICR_Avail_P_avg -0.10491    0.08339  -1.258       NA
## 
## Residual standard error: 0.2971 on 438 degrees of freedom
## Multiple R-Squared: 0.01508,  Adjusted R-squared: 0.008334 
## 
## Boot restarting based on 8 samples. Last fit:
## Convergence attained in 1 iterations (rel. change 0)
# Extract breakpoint
breakpoint <- seg_model$psi[2]
cat("Estimated Breakpoint:", round(breakpoint, 2), "\n")
## Estimated Breakpoint: 4.83
# Add predictions
plot_data$predicted <- predict(seg_model)

# ===============================
# 6) Plot
# ===============================

ggplot(plot_data,
       aes(x = ICR_Avail_P_avg,
           y = env_index_avg)) +
  
  geom_point(alpha = 0.7,
             color = "steelblue",
             size = 2) +
  
  geom_line(aes(y = predicted),
            color = "red",
            size = 1.2) +
  
  geom_vline(xintercept = breakpoint,
             linetype = "dashed",
             color = "darkgreen",
             size = 1) +
  
  scale_x_continuous(breaks = seq(0, 70, by = 10)) +
  
  labs(
    title = "Breakpoint Model: Environmental Index vs Available P",
    subtitle = paste0("Estimated breakpoint ≈ ",
                      round(breakpoint, 2)),
    x = "ICR_Avail_P_avg",
    y = "Mean env_index across seasons"
  ) +
  
  theme_minimal(base_size = 14)

### Add the fariable P_factor to drivers_data

library(dplyr)

drivers_data <- drivers_data %>%
  mutate(
    P_factor = case_when(
      ICR_Avail_P_avg < 5 ~ "low",
      ICR_Avail_P_avg >= 5 & ICR_Avail_P_avg < 15 ~ "mid",
      ICR_Avail_P_avg >= 15 ~ "high",
      TRUE ~ NA_character_
    ),
    P_factor = factor(P_factor,
                      levels = c("low", "mid", "high"))
  )
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_Avail_P_avg), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_Avail_P_avg ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3), 
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_Avail_P_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Available Phosphorus",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_Avail_P_avg (Phosphorus ppm)"
  ) +
  theme_minimal()

### Now we do cathegoris rathen than num Very Low/Critical Range: =<5 ppm Deficient (Critical Level): 5- 10 ppm Low to Moderate Range: 10-15 ppm Optimal/Sufficient Range: > 15 ppm

CA Performance as affected by ICR_arable_land_avg

NOtes: no correlation when pooled together but a trant in 2 out of 4 environments. Moreover it is a hosehold variable that is ususally used to explain perfomrance of practices Conclusions: Keep it for drivers analysis

with avg_Y_ratio

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_arable_land_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_arable_land_avg vs. avg_Y_ratio",
    x = "Arable Land (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()

### In facet by environment

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_arable_land_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_arable_land_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_arable_land_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Arable land (%)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###ICR_arable_land_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

get_letters <- function(df) {
  mod <- aov(ICR_arable_land_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_arable_land_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_arable_land_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 1)))

ggplot(drivers_data, aes(x = CA_typology, y = ICR_arable_land_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 90, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 82, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_arable_land_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Arable land (%)"
  ) +
  theme_minimal()

###Correlation: ICR_Org_C_avg vs ICR_arable_land_avg

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_arable_land_avg), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_arable_land_avg ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

label_y <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_arable_land_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Arable Land",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_arable_land_avg (Arable land %)"
  ) +
  theme_minimal()

## CA Performance as affected by ICF_Alt_avg

NOtes: there seems to be no correlation when pooled together and even less when split by environment. Environment clasiffication is mainly based on altitude so if we are using em=nvironments as driver, altitude can be eliminated Conclusions: Do not use further in drivers analysis

###All together

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_Alt_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICF_Alt_avg vs. avg_Y_ratio",
    x = "Altitude (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()

with avg_Y_ratio

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICF_Alt_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICF_Alt_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICF_Alt_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_Alt_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values per environment",
    x = "Altitude (m)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###ICF_Alt_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

# Function for letters
get_letters <- function(df) {
  mod <- aov(ICF_Alt_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

# Run stats by environment
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICF_Alt_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICF_Alt_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 0)))

# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICF_Alt_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 2000, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 1800, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_Alt_avg by CA_typology (per environment)",
    subtitle = "Tukey HSD: significant differences by letter",
    x = "CA Typology",
    y = "Altitude (m)"
  ) +
  theme_minimal()

CA Performance as affected by ICR_rainfall_avg

NOtes: Rain seems to partially explain CA performance, with a cuadratice-model response when pooled all environmetns together, with 4 mm per day as optimumn. However, when split by environments, linear models can fit as good Conclusions: Keep, consider cuadratic relation and not linear if not grouped within environments ###Simple regression: ICR_rainfall_avg vs avg_Y_ratio

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg, data = drivers_data)

# 2. Extract p-value and R-squared from model summary
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5, color = "black") +
  theme_minimal()

### What wbout tieh a cuadratic model?

library(ggplot2)

# 1. Fit quadratic model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2),
            data = drivers_data)

# 2. Extract p-value and R²
p_val <- summary(model)$coefficients["I(ICR_rainfall_avg^2)", 4]
r2 <- summary(model)$r.squared

# 3. Create plot with p and R²
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2),
    se = TRUE,
    color = "darkred",
    linetype = "dashed"
  ) +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
annotate("text", 
         x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
         label = paste0("p = ", signif(p_val, 3),
                        "\nR^2 = ", round(r2, 2)),
         size = 5)

###Faceted regression: ICR_rainfall_avg vs avg_Y_ratio by environment

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

# 1. Fit separate linear models per environment
stats_df <- drivers_data %>%
  filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

# 2. Annotation positions
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_rainfall_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values per environment",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

### same as avobe but with cuadratic model

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

# 1. Fit quadratic models per environment
stats_df <- drivers_data %>%
  filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2), data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients["I(ICR_rainfall_avg^2)", 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),

    # IMPORTANT: use R^2 (not R²)
    stat_label = paste0(
      "p = ", signif(p_value, 3),
      "\nR^2 = ", round(r_squared, 2)
    )
  ) %>%
  dplyr::select(IQF_environment, stat_label)


stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2),
    se = TRUE,
    color = "darkred",
    linetype = "dashed"
  ) +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
labs(
  title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
  subtitle = "Quadratic regressions per environment with p-values and R^2",
  x = "Rainfall (avg, mm)",
  y = "Average Yield Ratio"
)

###Boxplot: ICR_rainfall_avg by CA_typology (with Tukey HSD letters)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

# Function to extract Tukey letters
get_letters <- function(df) {
  mod <- aov(ICR_rainfall_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

# Run stats by environment
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_rainfall_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_rainfall_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICR_rainfall_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 7, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 6.2, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_rainfall_avg by CA_typology (per environment)",
    subtitle = "Tukey HSD: significant differences by letter",
    x = "CA Typology",
    y = "Rainfall (avg, mm)"
  ) +
  theme_minimal()

## CA Performance as affected by Comp_amount_corr_quali

NOtes: It seems to have a significant relation with both Y_relative and typology. Wether pooled all together of per environment, but when split by environment, all show the sam esign of trend, but is clear and significant only for mid-dry and transition Conclusions: Keep for the drivers analysis ###Simple regression: Comp_amount_corr_quali vs avg_Y_ratio

library(ggplot2)

# Filter data
df_filt <- drivers_data %>%
  dplyr::filter(!is.na(Comp_amount_corr_quali),
                !is.na(avg_Y_ratio),
                Comp_amount_corr_quali <= 15)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = df_filt)

# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "Comp_amount_corr_quali vs. avg_Y_ratio",
    x = "Compost amount (corrected, ≤15)",
    y = "Average Yield Ratio"
  ) +
  annotate("text",
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5) +
  theme_minimal()

###Linear regression per environment (facet + R² + p, compost ≤ 15)

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(Comp_amount_corr_quali),
                !is.na(avg_Y_ratio),
                !is.na(IQF_environment),
                Comp_amount_corr_quali <= 15)

#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Comp_amount_corr_quali vs. avg_Y_ratio",
    subtitle = "Linear regressions per environment (compost ≤ 15)",
    x = "Compost amount (corrected)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###Boxplot by CA_typology (Tukey HSD, compost ≤ 15)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(CA_typology),
                !is.na(Comp_amount_corr_quali),
                !is.na(IQF_environment),
                Comp_amount_corr_quali <= 15)

get_letters <- function(df) {
  mod <- aov(Comp_amount_corr_quali ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(Comp_amount_corr_quali, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

ggplot(df_filt, aes(x = CA_typology, y = Comp_amount_corr_quali)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95,
        label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.85,
        label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "Comp_amount_corr_quali by CA_typology (per environment)",
    subtitle = "Tukey HSD (compost ≤ 15)",
    x = "CA Typology",
    y = "Compost amount (corrected)"
  ) +
  theme_minimal()

## CA Performance as affected by ICF_planting_date

NOtes: Conclusions: ###Simple regression: ICR_rainfall_avg ICF_planting_date

library(ggplot2)
library(dplyr)

# Filter data
df_filt <- drivers_data %>%
  dplyr::filter(!is.na(ICF_planting_date),
                !is.na(avg_Y_ratio))

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_planting_date, data = df_filt)

# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICF_planting_date vs. avg_Y_ratio",
    x = "Planting date",
    y = "Average Yield Ratio"
  ) +
  annotate("text",
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5) +
  theme_minimal()

###Linear regression per environment (facet + R^2 + p)

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(ICF_planting_date),
                !is.na(avg_Y_ratio),
                !is.na(IQF_environment))

#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICF_planting_date, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date vs. avg_Y_ratio",
    subtitle = "Linear regressions per environment",
    x = "Planting date",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###Boxplot by CA_typology (Tukey HSD)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(CA_typology),
                !is.na(ICF_planting_date),
                !is.na(IQF_environment))

get_letters <- function(df) {
  mod <- aov(ICF_planting_date ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICF_planting_date, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 1)))

ggplot(df_filt, aes(x = CA_typology, y = ICF_planting_date)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95,
        label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.85,
        label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date by CA_typology (per environment)",
    subtitle = "Tukey HSD",
    x = "CA Typology",
    y = "Planting date"
  ) +
  theme_minimal()

###How does delay in planting relate to rain over the season?

library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)

# 1. Filter valid data
df_filt <- drivers_data %>%
  dplyr::filter(
    !is.na(ICF_planting_date),
    !is.na(ICR_rainfall_avg),
    !is.na(IQF_environment)
  )

# 2. Fit linear model per environment
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_rainfall_avg ~ ICF_planting_date, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>% 
  dplyr::select(IQF_environment, stat_label)

# 3. Annotation position
label_y <- max(df_filt$ICR_rainfall_avg, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 4. Plot
ggplot(df_filt, aes(x = ICF_planting_date, y = ICR_rainfall_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 4,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date vs. ICR_rainfall_avg",
    subtitle = "Linear regression per environment (p-value and R^2 shown)",
    x = "Planting date",
    y = "Rainfall average (mm)"
  ) +
  theme_minimal()

CA Performance as affected by slope of the plot

library(ggplot2)
library(dplyr)

# Filter data
df_filt <- drivers_data %>%
  filter(!is.na(Slope),
         !is.na(avg_Y_ratio))

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ Slope, data = df_filt)

# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(df_filt, aes(x = Slope, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE,
              color = "darkred", linetype = "dashed") +
  labs(
    title = "Slope vs. avg_Y_ratio",
    x = "Slope",
    y = "Average Yield Ratio"
  ) +
annotate("text",
         x = Inf, y = Inf,
         hjust = 1.1, vjust = 1.5,
         label = paste0("p = ", signif(p_val, 3),
                        "\nR^2 == ", round(r2, 2)),
         parse = TRUE,
         size = 5) +
  theme_minimal()

###Linear regression per environment using Slope

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

df_filt <- drivers_data %>%
  filter(!is.na(Slope),
         !is.na(avg_Y_ratio),
         !is.na(IQF_environment))

#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ Slope, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$Slope, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = Slope, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE,
              color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Slope vs. avg_Y_ratio",
    subtitle = "Linear regressions per environment",
    x = "Slope",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

### Now slopes per CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

df_filt <- drivers_data %>%
  filter(!is.na(CA_typology),
         !is.na(Slope),
         !is.na(IQF_environment))

#----------------------------------------------
# Function to extract Tukey letters
#----------------------------------------------
get_letters <- function(df) {
  mod <- aov(Slope ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#----------------------------------------------
# Compute letters and means
#----------------------------------------------
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(Slope, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

#----------------------------------------------
# Plot
#----------------------------------------------
ggplot(df_filt, aes(x = CA_typology, y = Slope)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6,
              color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$Slope, na.rm = TRUE) * 0.95,
        label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$Slope, na.rm = TRUE) * 0.85,
        label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "Slope by CA_typology (per environment)",
    subtitle = "ANOVA + Tukey HSD",
    x = "CA Typology",
    y = "Slope"
  ) +
  theme_minimal()

### What about slope cathegory and yield ratio

# First we add clope_catto drivers data
library(dplyr)

lookup_slope <- CONV_CA_Ratio %>%
  dplyr::select(IQR_block, slope_cat) %>%
  distinct(IQR_block, .keep_all = TRUE)

drivers_data <- drivers_data %>%
  left_join(lookup_slope, by = "IQR_block")

## Then boxes of avrage Y ratio of CA per slope cathegories by agzone
library(ggplot2)

# Make sure slope_cat is categorical
drivers_data$slope_cat <- as.factor(drivers_data$slope_cat)

ggplot(drivers_data, aes(x = slope_cat, y = avg_Y_ratio, fill = slope_cat)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ IQF_environment) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  ) +
  labs(
    x = "Slope Category",
    y = "Average Y Ratio",
    title = "Average Y Ratio by Slope Category across Environments"
  )

##Factorial variables ## CA Performance as affected by IQR_TF_tubura_client NOtes: Conclusions: ###IQR_TF_tubura_client with Y_ratio

library(ggplot2)
library(dplyr)

# 1. Filter valid data
df_filt <- drivers_data %>%
  dplyr::filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_TF_tubura_client),
    !IQR_TF_tubura_client %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_TF_tubura_client = droplevels(IQR_TF_tubura_client))  # drop unused levels

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Run Kruskal-Wallis test
kruskal_test <- kruskal.test(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
kruskal_p <- kruskal_test$p.value

# 4. Build annotation label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 5. Calculate group means
mean_df <- df_filt %>%
  group_by(IQR_TF_tubura_client) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Plot boxplot with p-values and means
ggplot(df_filt, aes(x = IQR_TF_tubura_client, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Means under each box
  geom_text(
    data = mean_df,
    aes(x = IQR_TF_tubura_client,
        y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
        label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # P-value annotation
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_TF_tubura_client",
    subtitle = "ANOVA and Kruskal–Wallis test results shown",
    x = "Tubura client status",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_TF_tubura_client with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# Clean and prepare data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQR_TF_tubura_client),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQR_TF_tubura_client %in% c("no data", "---", "", "NA")
  )

# Calculate N per category of explanatory variable
n_labels <- df_bar %>%
  group_by(IQR_TF_tubura_client) %>%
  summarise(n = n(), .groups = "drop")

# Plot with N annotations
ggplot(df_bar, aes(x = IQR_TF_tubura_client, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQR_TF_tubura_client, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Tubura client status",
    subtitle = "Proportions with total sample size per group",
    x = "IQR_TF_tubura_client",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# Build contingency table
tab <- table(df_bar$IQR_TF_tubura_client, df_bar$CA_typology)

# Run chi-squared test
chi_test <- chisq.test(tab)

# Print results
chi_test
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = NaN, df = 6, p-value = NA
# Fisher’s Exact Test (for categorical association)
fisher_test <- fisher.test(tab)

# Print results
fisher_test
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab
## p-value = 0.673
## alternative hypothesis: two.sided

CA Performance as affected by IQR_plot_fert_quality

NOtes: Conclusions: ###IQR_plot_fert_quality with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_plot_fert_quality),
    !IQR_plot_fert_quality %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_plot_fert_quality = factor(IQR_plot_fert_quality, levels = c("low_quality", "medium", "high")))


# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Tukey HSD and Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_plot_fert_quality`[, "p adj"])$Letters

letters_df <- data.frame(
  IQR_plot_fert_quality = names(letters),
  Letters = letters,
  stringsAsFactors = FALSE
)

# 4. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)$p.value
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means and letters
labels_df <- left_join(means_df, letters_df, by = "IQR_plot_fert_quality")

# 7. Plot
ggplot(df_filt, aes(x = IQR_plot_fert_quality, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Add Tukey letters above boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_plot_fert_quality, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Add means below boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_plot_fert_quality, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # Add test label
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_plot_fert_quality",
    subtitle = "Tukey HSD letters, means, and statistical tests",
    x = "Plot Fertility Quality",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_plot_fert_quality with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean and prepare data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQR_plot_fert_quality),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQR_plot_fert_quality %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(
    IQR_plot_fert_quality = factor(IQR_plot_fert_quality,
                                   levels = c("high", "medium", "low_quality"))  # Desired order
  )

# 2. Calculate N per category
n_labels <- df_bar %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(n = n(), .groups = "drop")

# 3. Plot
ggplot(df_bar, aes(x = IQR_plot_fert_quality, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQR_plot_fert_quality, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Fertilizer Quality",
    subtitle = "Proportions with total sample size per group",
    x = "IQR_plot_fert_quality",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Chi-squared test
tab <- table(df_bar$IQR_plot_fert_quality, df_bar$CA_typology)
chi_test <- chisq.test(tab)
chi_test
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 3.8727, df = 6, p-value = 0.6939

how does FErtility perceibed by farmers relate to researchers’ indicators of fertility

library(dplyr)
library(ggplot2)

fert_data <- drivers_data %>%
  filter(!is.na(IQR_plot_fert_quality)) %>%
  mutate(
    IQR_plot_fert_quality = droplevels(IQR_plot_fert_quality)
  )
## Paste the env_index

library(dplyr)

env_block_avg <- CONV_CA_Ratio %>%
  filter(!is.na(IQR_block),
         !is.na(env_index)) %>%
  group_by(IQR_block) %>%
  summarise(
    env_index_avg = mean(env_index, na.rm = TRUE),
    .groups = "drop"
  )

fert_data <- drivers_data %>%
  filter(!is.na(IQR_plot_fert_quality)) %>%
  mutate(
    IQR_plot_fert_quality = droplevels(IQR_plot_fert_quality)
  ) %>%
  left_join(env_block_avg, by = "IQR_block")

cat("Rows in fert_data:", nrow(fert_data), "\n")
## Rows in fert_data: 471
summary(fert_data$env_index_avg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3312  0.8015  0.9828  0.9967  1.1471  2.4047
# Check variables relation

# Environmetal index

library(dplyr)
library(ggplot2)

# ---------------------------------------
# 1️⃣ Ensure env_index_avg exists
# ---------------------------------------

fert_data <- fert_data %>%
  filter(!is.na(IQR_plot_fert_quality),
         !is.na(env_index_avg))

# ---------------------------------------
# 2️⃣ Summary statistics
# ---------------------------------------

summary_env <- fert_data %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(
    mean_val = mean(env_index_avg, na.rm = TRUE),
    N = n(),
    y_pos = max(env_index_avg, na.rm = TRUE) * 1.05,
    label = paste0("Mean = ", round(mean_val, 2),
                   "\nN = ", N),
    .groups = "drop"
  )

# ---------------------------------------
# 3️⃣ Plot
# ---------------------------------------

ggplot(fert_data,
       aes(x = IQR_plot_fert_quality,
           y = env_index_avg,
           fill = IQR_plot_fert_quality)) +
  
  geom_boxplot(alpha = 0.6) +
  
  geom_jitter(width = 0.15,
              alpha = 0.3,
              color = "black") +
  
  geom_text(data = summary_env,
            aes(x = IQR_plot_fert_quality,
                y = y_pos,
                label = label),
            inherit.aes = FALSE,
            size = 4) +
  
  labs(
    title = "Environmental Index vs Farmer Perceived Soil Fertility",
    x = "Farmer Perceived Soil Fertility",
    y = "Average Environmental Index (across seasons)"
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

# SOC
library(dplyr)
library(ggplot2)

fert_data <- drivers_data %>%
  filter(!is.na(IQR_plot_fert_quality),
         !is.na(ICR_Org_C_avg))

summary_OC <- fert_data %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(
    mean_val = mean(ICR_Org_C_avg, na.rm = TRUE),
    N = n(),
    y_pos = max(ICR_Org_C_avg, na.rm = TRUE) * 1.05,
    label = paste0("Mean=", round(mean_val, 2),
                   "\nN=", N),
    .groups = "drop"
  )

# Plot
ggplot(fert_data,
       aes(x = IQR_plot_fert_quality,
           y = ICR_Org_C_avg,
           fill = IQR_plot_fert_quality)) +
  
  geom_boxplot(alpha = 0.6) +
  geom_jitter(width = 0.15, alpha = 0.3) +
  
  geom_text(data = summary_OC,
            aes(x = IQR_plot_fert_quality,
                y = y_pos,
                label = label),
            inherit.aes = FALSE,
            size = 4) +
  
  labs(
    title = "Organic Carbon vs Farmer Perception",
    x = "Farmer Perceived Soil Fertility",
    y = "Organic Carbon"
  ) +
  
  theme_minimal() +
  theme(legend.position = "none")

##P


# Summary
summary_P <- fert_data %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(
    mean_val = mean(ICR_Avail_P_avg, na.rm = TRUE),
    N = n(),
    y_pos = max(ICR_Avail_P_avg, na.rm = TRUE) * 1.05,
    label = paste0("Mean=", round(mean_val, 1),
                   "\nN=", N),
    .groups = "drop"
  )

ggplot(fert_data,
       aes(x = IQR_plot_fert_quality,
           y = ICR_Avail_P_avg,
           fill = IQR_plot_fert_quality)) +
  geom_boxplot(alpha = 0.6) +
  geom_jitter(width = 0.15, alpha = 0.3) +
  geom_text(data = summary_P,
            aes(x = IQR_plot_fert_quality,
                y = y_pos,
                label = label),
            inherit.aes = FALSE,
            size = 4) +
  theme_minimal() +
  theme(legend.position = "none")

##K

summary_K <- fert_data %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(
    mean_val = mean(ICR_K_perc_23A, na.rm = TRUE),
    N = n(),
    y_pos = max(ICR_K_perc_23A, na.rm = TRUE) * 1.05,
    label = paste0("Mean = ", round(mean_val, 3),
                   "\nN = ", N),
    .groups = "drop"
  )

# ---------------------------------------
# 3️⃣ Plot
# ---------------------------------------

ggplot(fert_data,
       aes(x = IQR_plot_fert_quality,
           y = ICR_K_perc_23A,
           fill = IQR_plot_fert_quality)) +
  
  geom_boxplot(alpha = 0.6) +
  
  geom_jitter(width = 0.15,
              alpha = 0.3,
              color = "black") +
  
  geom_text(data = summary_K,
            aes(x = IQR_plot_fert_quality,
                y = y_pos,
                label = label),
            inherit.aes = FALSE,
            size = 4) +
  
  labs(
    title = "Potassium (ICR_K_perc_23A) vs Farmer Perceived Soil Fertility",
    x = "Farmer Perceived Soil Fertility",
    y = "Potassium (%)"
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

# Soil texture

ggplot(fert_data,
       aes(x = IQR_plot_fert_quality,
           fill = IQR_soil_texture)) +
  geom_bar(position = "fill") +
  labs(
    title = "Soil Texture Distribution by Farmer Perception",
    x = "Farmer Perception",
    y = "Proportion",
    fill = "Soil Texture"
  ) +
  theme_minimal()

chisq.test(table(fert_data$IQR_plot_fert_quality,
                 fert_data$IQR_soil_texture))
## 
##  Pearson's Chi-squared test
## 
## data:  table(fert_data$IQR_plot_fert_quality, fert_data$IQR_soil_texture)
## X-squared = NaN, df = 9, p-value = NA
# Soil color
ggplot(fert_data,
       aes(x = IQR_plot_fert_quality,
           fill = IQR_soil_color)) +
  geom_bar(position = "fill") +
  labs(
    title = "Soil Color by Farmer Perception",
    y = "Proportion"
  ) +
  theme_minimal()

# Slope position

ggplot(fert_data,
       aes(x = IQR_plot_fert_quality,
           fill = IQF_position_slope)) +
  geom_bar(position = "fill") +
  labs(
    title = "Slope Position by Farmer Perception",
    y = "Proportion"
  ) +
  theme_minimal()

## CA Performance as affected by IQR_soil_texture NOtes: Conclusions: ###IQR_soil_texture with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean the data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_soil_texture),
    !IQR_soil_texture %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_soil_texture = droplevels(IQR_soil_texture))

# Optional: Reorder texture levels manually (if desired)
# Example order: "sand", "loam", "clay", etc.
df_filt <- df_filt %>%
  mutate(IQR_soil_texture = factor(IQR_soil_texture, levels = c("Fine", "Mix", "Coarse")))

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Run Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)$p.value

# 4. Tukey HSD and compact letter display
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_texture`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_texture = names(letters), Letters = letters)

# 5. Compute group means
means_df <- df_filt %>%
  group_by(IQR_soil_texture) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_texture")

# 7. Test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_texture, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above box
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_texture, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below box
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_texture, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # Statistical tests label
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_soil_texture",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Soil Texture",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_soil_texture with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean and prepare data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQR_soil_texture),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQR_soil_texture %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_soil_texture = droplevels(as.factor(IQR_soil_texture))) %>%
  mutate(IQR_soil_texture = factor(IQR_soil_texture ,
                                   levels = c("Fine", "Mix", "Coarse")))
    # Desired order

# 2. Calculate N per level of explanatory variable
n_labels <- df_bar %>%
  group_by(IQR_soil_texture) %>%
  summarise(n = n(), .groups = "drop")

# 3. Bar plot
ggplot(df_bar, aes(x = IQR_soil_texture, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQR_soil_texture, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Soil Texture",
    subtitle = "Proportions with total sample size per texture class",
    x = "IQR_soil_texture",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Statistical tests
tab <- table(df_bar$IQR_soil_texture, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
chi_test
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 6.0014, df = 6, p-value = 0.423

CA Performance as affected by IQR_soil_color

NOtes: Conclusions: ###IQR_soil_color with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean the data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_soil_color),
    !IQR_soil_color %in% c("no data", "---", "", "#N/A")
  ) %>%
  mutate(IQR_soil_color = droplevels(IQR_soil_color))

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_color, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal–Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_color, data = df_filt)$p.value

# 4. Tukey HSD and letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_color`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_color = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQR_soil_color) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine letters and means
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_color")

# 7. Test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_color, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_color, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_color, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_soil_color",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Soil Color",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

### Soil color facet by environment

library(ggplot2)
library(dplyr)
library(multcompView)

# -------------------------------------------------
# 1️⃣ Filter and clean
# -------------------------------------------------

df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_soil_color),
    !is.na(IQF_environment),
    !IQR_soil_color %in% c("no data", "---", "", "#N/A")
  ) %>%
  mutate(IQR_soil_color = droplevels(factor(IQR_soil_color)))

# -------------------------------------------------
# 2️⃣ Run ANOVA + Kruskal + Tukey per environment
# -------------------------------------------------

results_env <- df_filt %>%
  group_by(IQF_environment) %>%
  group_modify(~{

    # Count number of soil color groups
    n_groups <- length(unique(.x$IQR_soil_color))

    # Compute group means (always)
    means_df <- .x %>%
      group_by(IQR_soil_color) %>%
      summarise(
        mean_val = mean(avg_Y_ratio, na.rm = TRUE),
        .groups = "drop"
      ) %>%
      mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

    # If fewer than 2 groups → skip tests
    if (n_groups < 2) {
      means_df$Letters <- ""
      means_df$anova_p <- NA
      means_df$kruskal_p <- NA
      return(means_df)
    }

    # ---------------- ANOVA ----------------
    anova_model <- aov(avg_Y_ratio ~ IQR_soil_color, data = .x)
    anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

    # ---------------- Kruskal ----------------
    kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_color, data = .x)$p.value

    # ---------------- Tukey (robust) ----------------
    tukey <- TukeyHSD(anova_model)
    tukey_mat <- tukey$IQR_soil_color

    if (is.null(tukey_mat) || nrow(tukey_mat) == 0) {

      means_df$Letters <- ""

    } else {

      tukey_vec <- tukey_mat[, "p adj"]

      # Ensure names exist
      if (is.null(names(tukey_vec))) {
        names(tukey_vec) <- rownames(tukey_mat)
      }

      letters <- multcompLetters(tukey_vec)$Letters

      letters_df <- data.frame(
        IQR_soil_color = names(letters),
        Letters = letters,
        stringsAsFactors = FALSE
      )

      means_df <- left_join(means_df, letters_df,
                            by = "IQR_soil_color")
    }

    means_df$anova_p <- anova_p
    means_df$kruskal_p <- kruskal_p

    means_df
  }) %>%
  ungroup()

# -------------------------------------------------
# 3️⃣ Create p-value labels per environment
# -------------------------------------------------

test_labels <- results_env %>%
  distinct(IQF_environment, anova_p, kruskal_p) %>%
  mutate(
    test_label = paste0(
      "ANOVA p = ", signif(anova_p, 3),
      "\nKruskal p = ", signif(kruskal_p, 3)
    )
  )

# -------------------------------------------------
# 4️⃣ Plot
# -------------------------------------------------

ggplot(df_filt, aes(x = IQR_soil_color, y = avg_Y_ratio)) +

  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Tukey Letters
  geom_text(
    data = results_env,
    aes(
      x = IQR_soil_color,
      y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.05,
      label = Letters
    ),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 5,
    color = "darkred"
  ) +

  # Means
  geom_text(
    data = results_env,
    aes(
      x = IQR_soil_color,
      y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
      label = mean_label
    ),
    inherit.aes = FALSE,
    size = 4
  ) +

  # P-values
  geom_text(
    data = test_labels,
    aes(x = Inf, y = Inf, label = test_label),
    hjust = 1.1,
    vjust = 1.5,
    inherit.aes = FALSE,
    size = 4
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "avg_Y_ratio by IQR_soil_color",
    subtitle = "Facet by Environment | Tukey letters + ANOVA + Kruskal–Wallis",
    x = "Soil Color",
    y = "Average Yield Ratio"
  ) +

  theme_minimal()

###IQR_soil_color with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQR_soil_color),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQR_soil_color %in% c("no data", "---", "", "#N/A")
  )

# 2. Calculate N per category
n_labels <- df_bar %>%
  group_by(IQR_soil_color) %>%
  summarise(n = n(), .groups = "drop")

# 3. Plot
ggplot(df_bar, aes(x = IQR_soil_color, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQR_soil_color, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Soil Color",
    subtitle = "Proportional bars with total sample size",
    x = "Soil Color",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Chi-squared and Fisher’s test
tab <- table(df_bar$IQR_soil_color, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = NaN, df = 15, p-value = NA
# Fisher’s Exact Test (with simulation to avoid memory issues)
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  1e+05 replicates)
## 
## data:  tab
## p-value = 0.6496
## alternative hypothesis: two.sided

CA Performance as affected by IQF_position_slope

NOtes: Conclusions: ###IQF_position_slope with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQF_position_slope),
    !IQF_position_slope %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQF_position_slope = droplevels(IQF_position_slope))

# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_position_slope, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_position_slope, data = df_filt)$p.value

# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_position_slope`[, "p adj"])$Letters
letters_df <- data.frame(IQF_position_slope = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQF_position_slope) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQF_position_slope")

# 7. Statistical test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQF_position_slope, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Tukey HSD letters (above)
  geom_text(
    data = labels_df,
    aes(x = IQF_position_slope, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means (below)
  geom_text(
    data = labels_df,
    aes(x = IQF_position_slope, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQF_position_slope",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Slope Position",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###Facet by environment

library(ggplot2)
library(dplyr)
library(multcompView)

# -------------------------------------------------
# 1️⃣ Filter and clean
# -------------------------------------------------

df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQF_position_slope),
    !is.na(IQF_environment),
    !IQF_position_slope %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQF_position_slope = droplevels(factor(IQF_position_slope)))

# -------------------------------------------------
# 2️⃣ Run ANOVA + Kruskal + Tukey per environment
# -------------------------------------------------

results_env <- df_filt %>%
  group_by(IQF_environment) %>%
  group_modify(~{

    n_groups <- length(unique(.x$IQF_position_slope))

    # Means
    means_df <- .x %>%
      group_by(IQF_position_slope) %>%
      summarise(
        mean_val = mean(avg_Y_ratio, na.rm = TRUE),
        .groups = "drop"
      ) %>%
      mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

    if (n_groups < 2) {
      means_df$Letters <- ""
      means_df$anova_p <- NA
      means_df$kruskal_p <- NA
      return(means_df)
    }

    # ANOVA
    anova_model <- aov(avg_Y_ratio ~ IQF_position_slope, data = .x)
    anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

    # Kruskal
    kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_position_slope, data = .x)$p.value

    # Tukey (robust)
    tukey <- TukeyHSD(anova_model)
    tukey_mat <- tukey$IQF_position_slope

    if (is.null(tukey_mat) || nrow(tukey_mat) == 0) {

      means_df$Letters <- ""

    } else {

      tukey_vec <- tukey_mat[, "p adj"]

      if (is.null(names(tukey_vec))) {
        names(tukey_vec) <- rownames(tukey_mat)
      }

      letters <- multcompLetters(tukey_vec)$Letters

      letters_df <- data.frame(
        IQF_position_slope = names(letters),
        Letters = letters,
        stringsAsFactors = FALSE
      )

      means_df <- left_join(means_df, letters_df,
                            by = "IQF_position_slope")
    }

    means_df$anova_p <- anova_p
    means_df$kruskal_p <- kruskal_p

    means_df
  }) %>%
  ungroup()

# -------------------------------------------------
# 3️⃣ Create p-value labels per environment
# -------------------------------------------------

test_labels <- results_env %>%
  distinct(IQF_environment, anova_p, kruskal_p) %>%
  mutate(
    test_label = paste0(
      "ANOVA p = ", signif(anova_p, 3),
      "\nKruskal p = ", signif(kruskal_p, 3)
    )
  )

# -------------------------------------------------
# 4️⃣ Plot
# -------------------------------------------------

ggplot(df_filt, aes(x = IQF_position_slope, y = avg_Y_ratio)) +

  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Tukey Letters
  geom_text(
    data = results_env,
    aes(
      x = IQF_position_slope,
      y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.05,
      label = Letters
    ),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 5,
    color = "darkred"
  ) +

  # Means
  geom_text(
    data = results_env,
    aes(
      x = IQF_position_slope,
      y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
      label = mean_label
    ),
    inherit.aes = FALSE,
    size = 4
  ) +

  # P-values
  geom_text(
    data = test_labels,
    aes(x = Inf, y = Inf, label = test_label),
    hjust = 1.1,
    vjust = 1.5,
    inherit.aes = FALSE,
    size = 4
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "avg_Y_ratio by IQF_position_slope",
    subtitle = "Facet by Environment | Tukey letters + ANOVA + Kruskal–Wallis",
    x = "Slope Position",
    y = "Average Yield Ratio"
  ) +

  theme_minimal()

###IQF_position_slope with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean and prepare data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQF_position_slope),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQF_position_slope %in% c("no data", "---", "", "NA")
  )

# 2. Calculate total N per slope category
n_labels <- df_bar %>%
  group_by(IQF_position_slope) %>%
  summarise(n = n(), .groups = "drop")

# 3. Plot proportion bar chart
ggplot(df_bar, aes(x = IQF_position_slope, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQF_position_slope, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Position on Slope",
    subtitle = "Proportions and total N per slope category",
    x = "Slope Position",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Run statistical tests
tab <- table(df_bar$IQF_position_slope, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = NaN, df = 18, p-value = NA
# Fisher’s Exact Test with simulation fallback
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  1e+05 replicates)
## 
## data:  tab
## p-value = 0.1501
## alternative hypothesis: two.sided

CA Performance as affected by Weed_species_A_combined

NOtes: Conclusions: ###Weed_species_A_combined with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(Weed_species_A_combined),
    !Weed_species_A_combined %in% c("no data", "-", "—", "NA")
  ) %>%
  mutate(Weed_species_A_combined = droplevels(Weed_species_A_combined))


# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal–Wallis
kruskal_p <- kruskal.test(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)$p.value

# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`Weed_species_A_combined`[, "p adj"])$Letters
letters_df <- data.frame(Weed_species_A_combined = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(Weed_species_A_combined) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "Weed_species_A_combined")

# 7. Combine test labels
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = Weed_species_A_combined, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above
  geom_text(
    data = labels_df,
    aes(x = Weed_species_A_combined, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below
  geom_text(
    data = labels_df,
    aes(x = Weed_species_A_combined, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by Weed_species_A_combined",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Weed Species (Combined)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###Weed_species_A_combined with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Filter and clean data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(Weed_species_A_combined),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !Weed_species_A_combined %in% c("no data", "—", "", "NA")
  )

# 2. Total N per weed category
n_labels <- df_bar %>%
  group_by(Weed_species_A_combined) %>%
  summarise(n = n(), .groups = "drop")

# 3. Proportional bar plot
ggplot(df_bar, aes(x = Weed_species_A_combined, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = Weed_species_A_combined, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Dominant Weed Species",
    subtitle = "Stacked bars show % CA_typology, with N on top",
    x = "Weed Species Group (A combined)",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Statistical tests
tab <- table(df_bar$Weed_species_A_combined, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = NaN, df = 12, p-value = NA
# Fisher’s Exact Test with simulated p-value (for large tables)
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  1e+05 replicates)
## 
## data:  tab
## p-value = 0.3741
## alternative hypothesis: two.sided

CA Performance as affected by IQF_environment

NOtes: Conclusions: ###IQF_environment with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQF_environment),
    !IQF_environment %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQF_environment = droplevels(factor(IQF_environment)))
df_filt <- df_filt %>%
  mutate(IQF_environment = factor(IQF_environment, levels = c("Highland", "Transition", "Mid_Low_Wet", "Mid_Low_Dry")))

# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_environment, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_environment, data = df_filt)$p.value

# 4. Tukey HSD + Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_environment`[, "p adj"])$Letters
letters_df <- data.frame(IQF_environment = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQF_environment) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine letters + means
labels_df <- left_join(means_df, letters_df, by = "IQF_environment")

# 7. Test summary
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQF_environment, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above
  geom_text(
    data = labels_df,
    aes(x = IQF_environment, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below
  geom_text(
    data = labels_df,
    aes(x = IQF_environment, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQF_environment",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Environment Type",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQF_environment with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQF_environment),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQF_environment %in% c("no data", "---", "", "NA")
  )
df_bar <- df_bar %>%
  mutate(IQF_environment = factor(IQF_environment, levels = c("Highland", "Transition", "Mid_Low_Wet", "Mid_Low_Dry")))

# 2. Total N per environment
n_labels <- df_bar %>%
  group_by(IQF_environment) %>%
  summarise(n = n(), .groups = "drop")

# 3. Proportional bar plot
ggplot(df_bar, aes(x = IQF_environment, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQF_environment, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology Distribution by Environment",
    subtitle = "Proportional bars with total N per environment",
    x = "IQF Environment",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Statistical tests
tab <- table(df_bar$IQF_environment, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 25.835, df = 9, p-value = 0.002174
# Fisher’s Exact Test with simulation
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  1e+05 replicates)
## 
## data:  tab
## p-value = 0.00761
## alternative hypothesis: two.sided

We evaluate some variables that where initially dropped

Total residues, Termite insidence, couch grass as weed and Erosion reduction ### Exploration of Total green residues at harvest variable

library(dplyr)

# Step 1: Calculate total unique blocks per season
block_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    total_blocks = n_distinct(IQR_block),
    
    blocks_with_residues_gt0 = n_distinct(IQR_block[ICR_total_residues_tn_ha > 0]),
    
    blocks_with_residues_nonNA = n_distinct(IQR_block[!is.na(ICR_total_residues_tn_ha)]),
    
    .groups = "drop"
  ) %>%
  mutate(
    perc_gt0 = 100 * blocks_with_residues_gt0 / total_blocks,
    perc_nonNA = 100 * blocks_with_residues_nonNA / total_blocks
  )

# View the result
print(block_summary)
## # A tibble: 6 × 6
##   IQR_Season total_blocks blocks_with_residues…¹ blocks_with_residues…² perc_gt0
##   <chr>             <int>                  <int>                  <int>    <dbl>
## 1 23A                 538                    537                    536     99.8
## 2 23B                 499                    234                    499     46.9
## 3 24A                 449                    419                    449     93.3
## 4 24B                 463                    461                    463     99.6
## 5 25A                 428                    427                    428     99.8
## 6 25B                 441                    441                    441    100  
## # ℹ abbreviated names: ¹​blocks_with_residues_gt0, ²​blocks_with_residues_nonNA
## # ℹ 1 more variable: perc_nonNA <dbl>
library(dplyr)

residue_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    count = sum(!is.na(ICR_total_residues_tn_ha)),
    min = min(ICR_total_residues_tn_ha, na.rm = TRUE),
    q25 = quantile(ICR_total_residues_tn_ha, 0.25, na.rm = TRUE),
    median = median(ICR_total_residues_tn_ha, na.rm = TRUE),
    mean = mean(ICR_total_residues_tn_ha, na.rm = TRUE),
    q75 = quantile(ICR_total_residues_tn_ha, 0.75, na.rm = TRUE),
    max = max(ICR_total_residues_tn_ha, na.rm = TRUE),
    sd = sd(ICR_total_residues_tn_ha, na.rm = TRUE)
  ) %>%
  arrange(IQR_Season)

print(residue_summary)
## # A tibble: 6 × 9
##   IQR_Season count    min    q25 median   mean    q75    max     sd
##   <chr>      <int>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 23A          536 3      11.7    15.9  16.7   20.0   139.    8.37 
## 2 23B          499 0       0       0     0.374  0.670   2.82  0.520
## 3 24A          449 0       8      16.3  17.1   24.5   189.   13.8  
## 4 24B          463 0       0.6     1.10  1.87   1.86  124.    5.94 
## 5 25A          428 0      12.0    20.2  23.4   32.0   141.   15.8  
## 6 25B          441 0.0472  0.675   1.06  1.70   1.73  174.    8.27
library(ggplot2)

ggplot(CONV_CA_Ratio, aes(x = IQR_Season, y = ICR_total_residues_tn_ha)) +
  geom_boxplot(outlier.color = "red", fill = "skyblue") +
  labs(title = "Distribution of Total Residues by Season",
       y = "ICR_total_residues_tn_ha", x = "IQR_Season") +
  theme_minimal()

Exploration of Total perceived mulch at planting variable

library(dplyr)

block_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    total_blocks = n_distinct(IQR_block),
    
    blocks_with_mulch_gt0 = n_distinct(IQR_block[ICR_mulch_perc_planti > 0]),
    
    blocks_with_mulch_nonNA = n_distinct(IQR_block[!is.na(ICR_mulch_perc_planti)]),
    
    .groups = "drop"
  ) %>%
  mutate(
    perc_gt0 = 100 * blocks_with_mulch_gt0 / total_blocks,
    perc_nonNA = 100 * blocks_with_mulch_nonNA / total_blocks
  )

print(block_summary)
## # A tibble: 6 × 6
##   IQR_Season total_blocks blocks_with_mulch_gt0 blocks_with_mulch_nonNA perc_gt0
##   <chr>             <int>                 <int>                   <int>    <dbl>
## 1 23A                 538                     0                     538      0  
## 2 23B                 499                   479                     499     96.0
## 3 24A                 449                   374                     449     83.3
## 4 24B                 463                   446                     463     96.3
## 5 25A                 428                   308                     426     72.0
## 6 25B                 441                   430                     429     97.5
## # ℹ 1 more variable: perc_nonNA <dbl>
library(dplyr)

mulch_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    count = sum(!is.na(ICR_mulch_perc_planti)),
    min = min(ICR_mulch_perc_planti, na.rm = TRUE),
    q25 = quantile(ICR_mulch_perc_planti, 0.25, na.rm = TRUE),
    median = median(ICR_mulch_perc_planti, na.rm = TRUE),
    mean = mean(ICR_mulch_perc_planti, na.rm = TRUE),
    q75 = quantile(ICR_mulch_perc_planti, 0.75, na.rm = TRUE),
    max = max(ICR_mulch_perc_planti, na.rm = TRUE),
    sd = sd(ICR_mulch_perc_planti, na.rm = TRUE)
  ) %>%
  arrange(IQR_Season)

print(mulch_summary)
## # A tibble: 6 × 9
##   IQR_Season count   min   q25 median  mean   q75   max    sd
##   <chr>      <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23A          538     0   0        0   0       0     0   0  
## 2 23B          499     0  46.5     65  56.2    75   162  28.2
## 3 24A          449     0  15       30  28.4    40   100  22.4
## 4 24B          463     0  30       50  49.6    70   100  26.9
## 5 25A          426     0   0       20  22.5    35    80  20.3
## 6 25B          429     5  40       60  57.3    75   100  22.0
library(ggplot2)

ggplot(CONV_CA_Ratio, aes(x = IQR_Season, y = ICR_mulch_perc_planti)) +
  geom_boxplot(outlier.color = "red", fill = "skyblue") +
  labs(title = "Distribution of Mulch per Plant by Season",
       y = "ICR_mulch_per_planti",
       x = "IQR_Season") +
  theme_minimal()

model <- lm(Y_ratio ~ ICR_mulch_perc_planti,
            data = CONV_CA_Ratio,
            na.action = na.omit)

summary(model)
## 
## Call:
## lm(formula = Y_ratio ~ ICR_mulch_perc_planti, data = CONV_CA_Ratio, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7573 -0.2142 -0.0365  0.1512  3.3184 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.8018102  0.0101032  79.362   <2e-16 ***
## ICR_mulch_perc_planti 0.0003911  0.0002177   1.796   0.0725 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3523 on 2802 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.00115,    Adjusted R-squared:  0.000794 
## F-statistic: 3.227 on 1 and 2802 DF,  p-value: 0.07252
library(ggplot2)

library(ggplot2)

library(dplyr)
library(ggplot2)

data_filt <- CONV_CA_Ratio %>%
  filter(year != 23)

ggplot(data_filt,
       aes(x = ICR_mulch_perc_planti,
           y = Y_ratio,
           color = crop)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Yield Ratio vs Mulch Percentage per Plant by Crop (Year 23 removed)",
    x = "ICR_mulch_perc_planti",
    y = "Y_ratio",
    color = "Crop"
  ) +
  theme_minimal()

### Is there an optimum much cover? Enough to reach close to plateau NO, it gous up utill 100% coverage

library(dplyr)
library(ggplot2)
library(segmented)
library(purrr)

# 1️⃣ Clean data (remove year 23 if desired)
data_filt <- CONV_CA_Ratio %>%
  filter(year != 23,
         !is.na(Y_ratio),
         !is.na(ICR_mulch_perc_planti),
         !is.na(crop))

# 2️⃣ Fit segmented model per crop
seg_models <- data_filt %>%
  group_by(crop) %>%
  group_modify(~{
    
    # Linear model first
    lm_mod <- lm(Y_ratio ~ ICR_mulch_perc_planti, data = .x)
    
    # Segmented model (starting guess at 30%)
    seg_mod <- tryCatch(
      segmented(lm_mod,
                seg.Z = ~ ICR_mulch_perc_planti,
                psi = list(ICR_mulch_perc_planti = 30)),
      error = function(e) NULL
    )
    
    if (is.null(seg_mod)) {
      .x$pred <- predict(lm_mod)
      .x$breakpoint <- NA
    } else {
      .x$pred <- predict(seg_mod)
      .x$breakpoint <- seg_mod$psi[2]
    }
    
    .x
  }) %>%
  ungroup()

# 3️⃣ Extract breakpoint per crop
breakpoints <- seg_models %>%
  group_by(crop) %>%
  summarise(breakpoint = first(breakpoint), .groups = "drop")

print(breakpoints)
## # A tibble: 2 × 2
##   crop  breakpoint
##   <chr>      <dbl>
## 1 Beans       15.0
## 2 Maize       30.0
# 4️⃣ Plot with facet by crop
ggplot(seg_models,
       aes(x = ICR_mulch_perc_planti,
           y = Y_ratio)) +
  
  geom_point(alpha = 0.4, color = "steelblue") +
  
  geom_line(aes(y = pred),
            color = "red",
            size = 1.2) +
  
  facet_wrap(~ crop, scales = "free") +
  
  labs(
    title = "Yield Ratio vs Mulch Percentage",
    subtitle = "Segmented (Plateau) Model per Crop",
    x = "Mulch (%)",
    y = "Y_ratio"
  ) +
  
  theme_minimal(base_size = 14)

Add mulch coverage at planting to data_mm_block databse

We do average of season 24A,B and 25A and B then by Blick ID

library(dplyr)

# 1. Calculate block-level averages
mulch_block_avg <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% c("24A", "24B", "25A", "25B")) %>%
  group_by(IQR_block) %>%
  summarise(
    avg_ICR_mulch_perc_planti = as.numeric(
      mean(ICR_mulch_perc_planti, na.rm = TRUE)
    ),
    .groups = "drop"
  )

# 2. Join into drivers_data
drivers_data <- drivers_data %>%
  left_join(mulch_block_avg, by = "IQR_block")

Mulch ICR_mulch_thicknesscm_planting

library(dplyr)

block_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    total_blocks = n_distinct(IQR_block),
    
    blocks_with_mulch_gt0 = n_distinct(IQR_block[ICR_mulch_thicknesscm_planting > 0]),
    
    blocks_with_mulch_nonNA = n_distinct(IQR_block[!is.na(ICR_mulch_thicknesscm_planting)]),
    
    .groups = "drop"
  ) %>%
  mutate(
    perc_gt0 = 100 * blocks_with_mulch_gt0 / total_blocks,
    perc_nonNA = 100 * blocks_with_mulch_nonNA / total_blocks
  )

print(block_summary)
## # A tibble: 6 × 6
##   IQR_Season total_blocks blocks_with_mulch_gt0 blocks_with_mulch_nonNA perc_gt0
##   <chr>             <int>                 <int>                   <int>    <dbl>
## 1 23A                 538                     0                     538    0    
## 2 23B                 499                     1                       0    0.200
## 3 24A                 449                   323                     449   71.9  
## 4 24B                 463                     1                       0    0.216
## 5 25A                 428                     1                       0    0.234
## 6 25B                 441                     1                       0    0.227
## # ℹ 1 more variable: perc_nonNA <dbl>
library(ggplot2)

ggplot(CONV_CA_Ratio,
       aes(x = IQR_Season,
           y = ICR_mulch_thicknesscm_planting)) +
  geom_boxplot(outlier.color = "red", fill = "skyblue") +
  labs(
    title = "Distribution of Mulch Thickness (cm) at Planting by Season",
    y = "ICR_mulch_thicknesscm_planting",
    x = "IQR_Season"
  ) +
  theme_minimal()

library(dplyr)
library(ggplot2)

data_filt <- CONV_CA_Ratio %>%
  filter(!IQR_Season %in% c("23A", "23B", "24B", "25A", "25B"))


ggplot(data_filt,
       aes(x = ICR_mulch_thicknesscm_planting,
           y = Y_ratio,
           color = crop)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Yield Ratio vs Mulch Thickness (cm) at Planting by Crop (Year 23 removed)",
    x = "ICR_mulch_thicknesscm_planting",
    y = "Y_ratio",
    color = "Crop"
  ) +
  theme_minimal()

data_filt2 <- CONV_CA_Ratio %>%
  filter(!IQR_Season %in% c("23A", "23B", "24B", "25A", "25B"))


ggplot(data_filt2,
       aes(x = ICR_mulch_thicknesscm_planting,
           y = ICR_mulch_perc_planti,
           color = crop)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "% Soil mulched vs Mulch Thickness (cm) at Planting by Crop (Year 23 removed)",
    x = "ICR_mulch_thicknesscm_planting",
    y = "% Soil mulched",
    color = "Crop"
  ) +
  theme_minimal()

Termites

library(dplyr)

block_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    total_blocks = n_distinct(IQR_block),
    
    blocks_with_termite_gt0 = n_distinct(IQR_block[ICR_Termite_incidence > 0]),
    
    blocks_with_termite_nonNA = n_distinct(IQR_block[!is.na(ICR_Termite_incidence)]),
    
    .groups = "drop"
  ) %>%
  mutate(
    perc_gt0 = 100 * blocks_with_termite_gt0 / total_blocks,
    perc_nonNA = 100 * blocks_with_termite_nonNA / total_blocks
  )

print(block_summary)
## # A tibble: 6 × 6
##   IQR_Season total_blocks blocks_with_termite_…¹ blocks_with_termite_…² perc_gt0
##   <chr>             <int>                  <int>                  <int>    <dbl>
## 1 23A                 538                      0                    538    0    
## 2 23B                 499                      1                     16    0.200
## 3 24A                 449                      1                    449    0.223
## 4 24B                 463                     75                    463   16.2  
## 5 25A                 428                      2                    428    0.467
## 6 25B                 441                      1                      0    0.227
## # ℹ abbreviated names: ¹​blocks_with_termite_gt0, ²​blocks_with_termite_nonNA
## # ℹ 1 more variable: perc_nonNA <dbl>
termite_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    count = sum(!is.na(ICR_Termite_incidence)),
    min = min(ICR_Termite_incidence, na.rm = TRUE),
    q25 = quantile(ICR_Termite_incidence, 0.25, na.rm = TRUE),
    median = median(ICR_Termite_incidence, na.rm = TRUE),
    mean = mean(ICR_Termite_incidence, na.rm = TRUE),
    q75 = quantile(ICR_Termite_incidence, 0.75, na.rm = TRUE),
    max = max(ICR_Termite_incidence, na.rm = TRUE),
    sd = sd(ICR_Termite_incidence, na.rm = TRUE)
  ) %>%
  arrange(IQR_Season)

print(termite_summary)
## # A tibble: 6 × 9
##   IQR_Season count   min   q25 median      mean   q75     max     sd
##   <chr>      <int> <dbl> <dbl>  <dbl>     <dbl> <dbl>   <dbl>  <dbl>
## 1 23A          538     0     0      0   0           0    0     0    
## 2 23B           16     0     0      0   0           0    0     0    
## 3 24A          449     0     0      0   0.00548     0    2.46  0.116
## 4 24B          463     0     0      0   0.162       0    1     0.369
## 5 25A          428     0     0      0   0.0444      0   15     0.750
## 6 25B            0   Inf    NA     NA NaN          NA -Inf    NA

How termites relate to Y_ratio

There is not clear relation

library(dplyr)
library(ggplot2)

# -------------------------------------------------
# 1️⃣ Clean main dataset FIRST
# -------------------------------------------------

CONV_CA_Ratio_clean <- CONV_CA_Ratio %>%
  filter(
    !is.na(Y_ratio),
    !is.na(IQR_Season),
    !is.na(IQR_block)
  ) %>%
  mutate(
    ICR_Termite_incidence = as.numeric(ICR_Termite_incidence),
    ICR_Termite_incidence = ifelse(is.na(ICR_Termite_incidence), 0, ICR_Termite_incidence),
    ICR_mulch_perc_planti = as.numeric(ICR_mulch_perc_planti)  
  )

# -------------------------------------------------
# 2️⃣ Apply 24B termite replacement by block
# -------------------------------------------------

termite_24B <- CONV_CA_Ratio_clean %>%
  filter(IQR_Season == "24B") %>%
  dplyr::select(IQR_block, ICR_24B = ICR_Termite_incidence)

plot_data <- CONV_CA_Ratio_clean %>%
  left_join(termite_24B, by = "IQR_block") %>%
  mutate(
    ICR_Termite_incidence = ifelse(
      IQR_Season == "24B",
      ICR_Termite_incidence,
      ICR_24B
    ),
    
    # 🔴 CRITICAL FIX
    ICR_Termite_incidence = coalesce(ICR_Termite_incidence, 0),
    
    Termite_group = ifelse(ICR_Termite_incidence > 0,
                           "Termite > 0",
                           "Termite = 0"),
    
    Termite_group = factor(Termite_group,
                           levels = c("Termite = 0", "Termite > 0"))
  ) %>%
  dplyr::select(IQR_block, IQR_Season, Y_ratio,
         ICR_Termite_incidence, Termite_group, ICR_mulch_perc_planti)

# -------------------------------------------------
# 3️⃣ Summary statistics
# -------------------------------------------------

summary_stats <- plot_data %>%
  group_by(IQR_Season, Termite_group) %>%
  summarise(
    mean_Y = mean(Y_ratio),
    N = n(),
    .groups = "drop"
  )

# -------------------------------------------------
# 4️⃣ T-tests per season
# -------------------------------------------------

t_test_results <- plot_data %>%
  group_by(IQR_Season) %>%
  summarise(
    p_value = tryCatch(
      t.test(Y_ratio ~ Termite_group)$p.value,
      error = function(e) NA_real_
    ),
    y_pos = max(Y_ratio) * 1.05,
    .groups = "drop"
  )

# -------------------------------------------------
# 5️⃣ Plot
# -------------------------------------------------

ggplot(plot_data,
       aes(x = Termite_group,
           y = Y_ratio,
           fill = Termite_group)) +
  
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.3) +
  
  geom_text(data = summary_stats,
            aes(y = mean_Y,
                label = paste0("Mean=", round(mean_Y,2),
                               "\nN=", N)),
            vjust = -1,
            size = 3) +
  
  geom_text(data = t_test_results,
            aes(x = 1.5,
                y = y_pos,
                label = paste0("p = ", signif(p_value, 3))),
            inherit.aes = FALSE,
            size = 4) +
  
  facet_wrap(~ IQR_Season) +
  
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  ) +
  
  labs(
    title = "Y_ratio by Termite Incidence Across Seasons",
    x = "Termite Incidence",
    y = "Y_ratio"
  )

RElation of mulch and termites

Consistently, plots with termite have more mulch. Then while mulch related positively with CA performance, this may mask the negative effect of termites

library(dplyr)
library(ggplot2)

# -------------------------------------------------
# 1️⃣ Clean dataset
# -------------------------------------------------

CONV_CA_Ratio_clean <- CONV_CA_Ratio %>%
  filter(
    !is.na(ICR_mulch_perc_planti),
    !is.na(IQR_Season),
    !is.na(IQR_block)
  ) %>%
  mutate(
    ICR_Termite_incidence = as.numeric(ICR_Termite_incidence),
    ICR_Termite_incidence = ifelse(is.na(ICR_Termite_incidence), 0, ICR_Termite_incidence),
    ICR_mulch_perc_planti = as.numeric(ICR_mulch_perc_planti)
  )

# -------------------------------------------------
# 2️⃣ Apply 24B termite replacement
# -------------------------------------------------

termite_24B <- CONV_CA_Ratio_clean %>%
  filter(IQR_Season == "24B") %>%
  dplyr::select(IQR_block, ICR_24B = ICR_Termite_incidence)

plot_data <- CONV_CA_Ratio_clean %>%
  left_join(termite_24B, by = "IQR_block") %>%
  mutate(
    ICR_Termite_incidence = ifelse(
      IQR_Season == "24B",
      ICR_Termite_incidence,
      ICR_24B
    ),
    ICR_Termite_incidence = coalesce(ICR_Termite_incidence, 0),
    Termite_group = ifelse(ICR_Termite_incidence > 0,
                           "Termite > 0",
                           "Termite = 0"),
    Termite_group = factor(Termite_group,
                           levels = c("Termite = 0", "Termite > 0"))
  ) %>%
  dplyr::select(IQR_block, IQR_Season,
         ICR_mulch_perc_planti,
         Termite_group)

# -------------------------------------------------
# 3️⃣ Summary statistics (Mulch)
# -------------------------------------------------

summary_stats <- plot_data %>%
  group_by(IQR_Season, Termite_group) %>%
  summarise(
    mean_mulch = mean(ICR_mulch_perc_planti),
    N = n(),
    .groups = "drop"
  )

# -------------------------------------------------
# 4️⃣ T-tests per season (Mulch)
# -------------------------------------------------

t_test_results <- plot_data %>%
  group_by(IQR_Season) %>%
  summarise(
    p_value = tryCatch(
      t.test(ICR_mulch_perc_planti ~ Termite_group)$p.value,
      error = function(e) NA_real_
    ),
    y_pos = max(ICR_mulch_perc_planti) * 1.05,
    .groups = "drop"
  )

# -------------------------------------------------
# 5️⃣ Plot
# -------------------------------------------------

ggplot(plot_data,
       aes(x = Termite_group,
           y = ICR_mulch_perc_planti,
           fill = Termite_group)) +
  
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.3) +
  
  geom_text(data = summary_stats,
            aes(y = mean_mulch,
                label = paste0("Mean=", round(mean_mulch,1),
                               "\nN=", N)),
            vjust = -1,
            size = 3) +
  
  geom_text(data = t_test_results,
            aes(x = 1.5,
                y = y_pos,
                label = paste0("p = ", signif(p_value, 3))),
            inherit.aes = FALSE,
            size = 4) +
  
  facet_wrap(~ IQR_Season) +
  
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  ) +
  
  labs(
    title = "Mulch Percentage at Planting by Termite Incidence",
    x = "Termite Incidence",
    y = "Mulch at Planting (%)"
  )

## Dive into the problematic weed “couchgrass; Digitaria abyssinica” This weed is very competitive and hard to erradicate by hand, so very complicated in CA systems. Current weed variable “Weed_species_A_combined” only divides into broad leaf, grasses and whitch weed. So now we will digg more into couch grass specifically

First lets explore when was this variable recorded NOtes: not well recorded in 23A

Extract the precence of couch grass in each block from ca_raw

library(dplyr)
library(stringr)

ca_raw <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/raw/ca_raw.csv")
library(dplyr)

result <- ca_raw %>%
  # Keep only Treat_code C
  filter(Treat_code == "C") %>%
  
  # Detect Couch grass (any spelling variant)
  mutate(Couch_Presence = ifelse(
    str_detect(IQR_Weedtype1_specie,
               regex("Couch_grass|couchgrass|Couch grass", ignore_case = FALSE)),
    1, 0
  )) %>%
  
  # Group by block and season
  group_by(IQR_block, IQR_Season) %>%
  
  # If it appears at least once → 1
  summarise(Couch_Presence = max(Couch_Presence, na.rm = TRUE),
            .groups = "drop")

result
library(dplyr)

season_percentages <- result %>%
  group_by(IQR_Season) %>%
  summarise(
    pct_0     = mean(Couch_Presence == 0, na.rm = TRUE) * 100,
    pct_1     = mean(Couch_Presence == 1, na.rm = TRUE) * 100,
    pct_negInf = mean(Couch_Presence == -Inf, na.rm = TRUE) * 100,
    .groups = "drop"
  )

season_percentages
library(dplyr)
library(tidyr)

couch_block_table <- result %>%
  
  # Keep only relevant seasons
  filter(IQR_Season %in% c("23A", "23B", "24A", "25A", "25B")) %>%
  
  # Pivot to wide format
  pivot_wider(
    names_from = IQR_Season,
    values_from = Couch_Presence,
    names_prefix = "Couch_"
  ) %>%
  
  # Replace missing with 0 (if a block had no observation that season)
  mutate(across(starts_with("Couch_"), ~replace_na(., 0))) %>%
  
  # Create consistency variable
  mutate(
    Couch_Consit = ifelse(
      rowSums(across(starts_with("Couch_"))) >= 3,
      1, 0
    )
  )

couch_block_table
library(dplyr)

percent_ones <- couch_block_table %>%
  summarise(
    across(
      -IQR_block,
      ~ mean(. == 1, na.rm = TRUE) * 100
    )
  )

percent_ones

WE now paste the variable couch grass in at leas 3 seasons in CONV_CA

library(dplyr)

CONV_CA_Ratio <- CONV_CA_Ratio %>%
  left_join(
    couch_block_table %>%
      dplyr::select(IQR_block, Couch_Consit),
    by = "IQR_block"
  )

Now we evaluate the relation among having couch grass (at least 2 seasons to make it more robust) with the overall average Y_ratio of a

library(dplyr)
library(ggplot2)
library(ggpubr)

plot_data <- CONV_CA_Ratio %>%
  mutate(Couch_Consit = factor(Couch_Consit, levels = c(0, 1)))

# per-facet positions
y_positions <- plot_data %>%
  group_by(IQF_environment) %>%
  summarise(
    y_max  = max(Y_ratio, na.rm = TRUE),
    y_label = y_max * 0.8,
    y_pval  = y_max * 0.6,
    .groups = "drop"
  )

# summary labels (N, Mean, %>=1)
summary_labels <- plot_data %>%
  group_by(IQF_environment, Couch_Consit) %>%
  summarise(
    N = n(),
    Mean = mean(Y_ratio, na.rm = TRUE),
    Pct_ge1 = mean(Y_ratio >= 1, na.rm = TRUE) * 100,
    label = paste0(
      "N = ", N,
      "\nMean = ", round(Mean, 2),
      "\n≥1 = ", round(Pct_ge1, 1), "%"
    ),
    .groups = "drop"
  ) %>%
  left_join(y_positions, by = "IQF_environment")

# p-values per environment
pvals <- plot_data %>%
  group_by(IQF_environment) %>%
  summarise(p = tryCatch(t.test(Y_ratio ~ Couch_Consit)$p.value, error = function(e) NA_real_),
            .groups = "drop") %>%
  left_join(y_positions, by = "IQF_environment") %>%
  mutate(
    group1 = "0",
    group2 = "1",
    y.position = y_pval,
    p.signif = case_when(
      is.na(p) ~ "NA",
      p <= 0.001 ~ "***",
      p <= 0.01  ~ "**",
      p <= 0.05  ~ "*",
      TRUE ~ "ns"
    )
  )

ggplot(plot_data, aes(x = Couch_Consit, y = Y_ratio, fill = Couch_Consit)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_text(
    data = summary_labels,
    aes(x = Couch_Consit, y = y_label, label = label),
    inherit.aes = FALSE,
    size = 3.2
  ) +
  stat_pvalue_manual(
    pvals,
    label = "p.signif",
    xmin = "group1",
    xmax = "group2",
    y.position = "y.position",
    tip.length = 0.01
  ) +
  facet_wrap(~ IQF_environment, scales = "free_y") +
  labs(x = "Couch Consistency", y = "Y_ratio") +
  coord_cartesian(clip = "off") +
  theme_bw() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold"),
    plot.margin = ggplot2::margin(10, 10, 30, 10)
  )

## now lets add couch grass precence to drivers

library(dplyr)

drivers_data <- drivers_data %>%
  left_join(
    couch_block_table %>%
      dplyr::select(IQR_block, Couch_Consit),
    by = "IQR_block"
  )

Now will see the list of unique weeds to correct if there is errors

library(dplyr)
library(tidyr)

unique_weeds <- weeds_data %>%
  pivot_longer(
    cols = starts_with("Weed_"),
    names_to = "Weed_number",
    values_to = "Weed_name"
  ) %>%
  filter(!is.na(Weed_name)) %>%
  distinct(Weed_name) %>%
  arrange(Weed_name)

unique_weeds

Now we add a variable that counts the N of weeds species in the plot

library(dplyr)

weeds_data <- weeds_data %>%
  rowwise() %>%
  mutate(
    N_weeds = n_distinct(
      c_across(starts_with("Weed_"))[
        !c_across(starts_with("Weed_")) %in% c(NA, "Other", "--")
      ]
    )
  ) %>%
  ungroup()

Lets see how was the weeds N across season

library(ggplot2)
library(dplyr)

ggplot(weeds_data, aes(x = N_weeds)) +
  geom_bar(fill = "steelblue", color = "black") +
  facet_wrap(~ IQR_Season) +
  theme_bw() +
  labs(
    x = "Number of Unique Weeds (N_weeds)",
    y = "Frequency (Number of Plots)",
    title = "Frequency Distribution of Weed Richness by Season"
  )

### NOw we add two new variables that is the count of broad leaf and of grasses

library(dplyr)
library(tidyr)

weed_counts <- weeds_data %>%
  mutate(row_id = row_number()) %>%   # unique row ID
  pivot_longer(
    cols = starts_with("Weed_"),
    values_to = "Weed_name"
  ) %>%
  filter(
    !is.na(Weed_name),
    Weed_name != "Other",
    Weed_name != "--"
  ) %>%
  left_join(
    weeds %>% dplyr::select(Name_Trial, Category),
    by = c("Weed_name" = "Name_Trial")
  )

weed_summary <- weed_counts %>%
  group_by(row_id) %>%
  summarise(
    N_Broadleaf = sum(Category == "Broadleaf", na.rm = TRUE),
    N_Grass     = sum(Category == "Grass", na.rm = TRUE),
    .groups = "drop"
  )

weeds_data <- weeds_data %>%
  mutate(row_id = row_number()) %>%
  left_join(weed_summary, by = "row_id") %>%
  dplyr::select(-row_id)

Now we include one more variable that indicates if there is cynodon dactilon

library(dplyr)

weeds_data <- weeds_data %>%
  mutate(
    CouchGrass = as.integer(
      rowSums(across(starts_with("Weed_"), ~ . == "Couch_grass"), 
              na.rm = TRUE) > 0
    )
  )

Now we make a new variable that indicats weed pressure

1 if 1 N_weeds = 0 1.5 if N_Broadleaf = 1 & N_Grass = 0 2 if N_Broadleaf = >2 & N_Grass = 0 3 if N_Broadleaf = 0 & N_Grass = 1 & CouchGrass is = 0 3.5 if N_Broadleaf > 0 & N_Grass = 1 & CouchGrass is = 0 4 N_Grass > 1 & CouchGrass is = 0 6 if N_Broadleaf = 0 & N_Grass = 1 & CouchGrass is = 1 6.5 if N_Broadleaf > 0 & N_Grass = 1 & CouchGrass is = 1 8 if N_Grass > 1 & CouchGrass is = 1 ### Same but for which weed

library(dplyr)

weeds_data <- weeds_data %>%
  mutate(
    Whitch_weed = as.integer(
      rowSums(across(starts_with("Weed_"), ~ . == "Witchweed"), 
              na.rm = TRUE) > 0
    )
  )
library(dplyr)

weeds_data <- weeds_data %>%
  mutate(
    Weed_Overall = case_when(
      # 1) Sin malezas
      N_weeds == 0 ~ 1,

      # Solo broadleaf, sin grass
      N_Broadleaf == 1 & N_Grass == 0 ~ 1.5,
      N_Broadleaf >= 2 & N_Grass == 0 ~ 2,

      # Grass = 1, sin Couch grass
      N_Broadleaf == 0 & N_Grass == 1 & CouchGrass == 0 ~ 3,
      N_Broadleaf >  0 & N_Grass == 1 & CouchGrass == 0 ~ 3.5,

      # Grass > 1, sin Couch grass
      N_Grass > 1 & CouchGrass == 0 ~ 4,

      # Grass = 1, con Couch grass
      N_Broadleaf == 0 & N_Grass == 1 & CouchGrass == 1 ~ 6,
      N_Broadleaf >  0 & N_Grass == 1 & CouchGrass == 1 ~ 6.5,

      # Grass > 1, con Couch grass
      N_Grass > 1 & CouchGrass == 1 ~ 8,

      # Si algo no entra en ninguna regla (por ejemplo N_Grass==0 & N_Broadleaf==0 pero N_weeds>0)
      TRUE ~ NA_real_
    )
  )

Now we exclude 2023A and all the seasons there Weed_1 is NA

library(dplyr)

weeds_data <- weeds_data %>%
  filter(
    IQR_Season != "23A",
    !is.na(Weed_1)
  )

Now we make an average score for each IQR_block and we add to CONV_CA_Ratio

library(dplyr)

# 1) Average Weed_Overall per block (from weeds_data)
weed_overall_block <- weeds_data %>%
  group_by(IQR_block) %>%
  summarise(
    Weed_Overall_avg = mean(Weed_Overall, na.rm = TRUE),
    .groups = "drop"
  )

# 2) Paste into CONV_CA_Ratio (keeps only IQR_block in CONV_CA_Ratio)
CONV_CA_Ratio <- CONV_CA_Ratio %>%
  left_join(weed_overall_block, by = "IQR_block")

now we see how it relates to Y_ratio performance

library(ggpubr)

ggplot(CONV_CA_Ratio,
       aes(x = Weed_Overall_avg, y = Y_ratio)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  stat_regline_equation() +
  stat_cor(method = "pearson") +
  facet_wrap(~ IQF_environment) +
  theme_bw()

now lets add the variable in drivers_data

library(dplyr)

drivers_data <- drivers_data %>%
  left_join(
    CONV_CA_Ratio %>%
      dplyr::select(IQR_block, Weed_Overall_avg) %>%
      distinct(IQR_block, .keep_all = TRUE),
    by = "IQR_block"
  )

Lets akso add the variable Whitch_weed into drivers data

if the weed is in al least on of the seasons we include it in the block

library(dplyr)

whitchweed_block <- weeds_data %>%
  group_by(IQR_block) %>%
  summarise(
    Whitch_weed = as.integer(any(Whitch_weed == 1, na.rm = TRUE)),
    .groups = "drop"
  )

whitchweed_block
library(dplyr)

drivers_data <- drivers_data %>%
  left_join(
    whitchweed_block %>%
      distinct(IQR_block, .keep_all = TRUE),
    by = "IQR_block"
  )

SH residues at harvest

library(dplyr)

block_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    total_blocks = n_distinct(IQR_block),
    
    blocks_with_sh_residues_gt0 = n_distinct(IQR_block[ICR_sh_residues_tn_ha > 0]),
    
    blocks_with_sh_residues_nonNA = n_distinct(IQR_block[!is.na(ICR_sh_residues_tn_ha)]),
    
    .groups = "drop"
  ) %>%
  mutate(
    perc_gt0 = 100 * blocks_with_sh_residues_gt0 / total_blocks,
    perc_nonNA = 100 * blocks_with_sh_residues_nonNA / total_blocks
  )

print(block_summary)
## # A tibble: 6 × 6
##   IQR_Season total_blocks blocks_with_sh_resid…¹ blocks_with_sh_resid…² perc_gt0
##   <chr>             <int>                  <int>                  <int>    <dbl>
## 1 23A                 538                      0                    538    0    
## 2 23B                 499                      1                      0    0.200
## 3 24A                 449                    303                    449   67.5  
## 4 24B                 463                    199                    463   43.0  
## 5 25A                 428                    278                    428   65.0  
## 6 25B                 441                      1                      0    0.227
## # ℹ abbreviated names: ¹​blocks_with_sh_residues_gt0,
## #   ²​blocks_with_sh_residues_nonNA
## # ℹ 1 more variable: perc_nonNA <dbl>
sh_residue_summary <- CONV_CA_Ratio %>%
  group_by(IQR_Season) %>%
  summarise(
    count = sum(!is.na(ICR_sh_residues_tn_ha)),
    min = min(ICR_sh_residues_tn_ha, na.rm = TRUE),
    q25 = quantile(ICR_sh_residues_tn_ha, 0.25, na.rm = TRUE),
    median = median(ICR_sh_residues_tn_ha, na.rm = TRUE),
    mean = mean(ICR_sh_residues_tn_ha, na.rm = TRUE),
    q75 = quantile(ICR_sh_residues_tn_ha, 0.75, na.rm = TRUE),
    max = max(ICR_sh_residues_tn_ha, na.rm = TRUE),
    sd = sd(ICR_sh_residues_tn_ha, na.rm = TRUE)
  ) %>%
  arrange(IQR_Season)

print(sh_residue_summary)
## # A tibble: 6 × 9
##   IQR_Season count   min   q25 median    mean    q75    max    sd
##   <chr>      <int> <dbl> <dbl>  <dbl>   <dbl>  <dbl>  <dbl> <dbl>
## 1 23A          538     0     0   0      0      0        0    0   
## 2 23B            0   Inf    NA  NA    NaN     NA     -Inf   NA   
## 3 24A          449     0     0   3.21   4.57   7.23    33.1  5.21
## 4 24B          463     0     0   0      0.785  0.695   12    1.63
## 5 25A          428     0     0   1.01   3.96   4.61   120    9.07
## 6 25B            0   Inf    NA  NA    NaN     NA     -Inf   NA
library(ggplot2)

ggplot(CONV_CA_Ratio,
       aes(x = IQR_Season,
           y = ICR_sh_residues_tn_ha)) +
  geom_boxplot(outlier.color = "red", fill = "skyblue") +
  labs(
    title = "Distribution of SH Residues (t/ha) by Season",
    y = "ICR_sh_residues_tn_ha",
    x = "IQR_Season"
  ) +
  theme_minimal()

model <- lm(Y_ratio ~ ICR_sh_residues_tn_ha,
            data = CONV_CA_Ratio,
            na.action = na.omit)

summary(model)
## 
## Call:
## lm(formula = Y_ratio ~ ICR_sh_residues_tn_ha, data = CONV_CA_Ratio, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.73520 -0.19663 -0.02359  0.15158  2.93672 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.8013764  0.0079983 100.193   <2e-16 ***
## ICR_sh_residues_tn_ha -0.0002543  0.0013606  -0.187    0.852    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3217 on 1876 degrees of freedom
##   (940 observations deleted due to missingness)
## Multiple R-squared:  1.862e-05,  Adjusted R-squared:  -0.0005144 
## F-statistic: 0.03494 on 1 and 1876 DF,  p-value: 0.8518
library(dplyr)
library(ggplot2)

data_filt <- CONV_CA_Ratio %>%
  filter(!IQR_Season %in% c("23A", "23B", "24B", "25B"))


ggplot(data_filt,
       aes(x = ICR_sh_residues_tn_ha,
           y = Y_ratio,
           color = crop)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Yield Ratio vs SH Residues (t/ha) by Crop (Year 23 removed)",
    x = "ICR_sh_residues_tn_ha",
    y = "Y_ratio",
    color = "Crop"
  ) +
  theme_minimal()

We add the variable IQF_agzone” to drivers to test if betetr than IQF_environment:

library(dplyr)

# --------------------------------------------------
# 1️⃣ Create unique block-level agzone table
# --------------------------------------------------

agzone_block <- CONV_CA_Ratio %>%
  filter(!is.na(IQR_block), !is.na(IQF_agzone)) %>%
  group_by(IQR_block) %>%
  summarise(
    IQF_agzone = first(IQF_agzone),
    .groups = "drop"
  )

# Optional check (recommended once)
# Verify there is only ONE agzone per block
check_agzone <- CONV_CA_Ratio %>%
  group_by(IQR_block) %>%
  summarise(n_agzone = n_distinct(IQF_agzone)) %>%
  filter(n_agzone > 1)

print(check_agzone)  # Should return 0 rows
## # A tibble: 21 × 2
##    IQR_block n_agzone
##    <chr>        <int>
##  1 102_10           2
##  2 110_10           2
##  3 115_11           2
##  4 116_10           2
##  5 116_3            2
##  6 132_6            2
##  7 133_1            2
##  8 16_10            2
##  9 22_10            2
## 10 31_10            2
## # ℹ 11 more rows
# --------------------------------------------------
# 2️⃣ Join into drivers_data
# --------------------------------------------------

drivers_data <- drivers_data %>%
  left_join(agzone_block, by = "IQR_block")

# --------------------------------------------------
# 3️⃣ Verify no duplication occurred
# --------------------------------------------------

cat("Rows in drivers_data:", nrow(drivers_data), "\n")
## Rows in drivers_data: 473
write.csv(drivers_data, "D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/drivers_data.csv", row.names = FALSE)

RF to ID drivers of CA performance

We used alternative reponse variables; 1) avg_Y_ratio: the yield ration of CA:CONV averaged across seasons for each plot (farmer). So each farm has one value. 2) Y_ratio_2425: like 1 but we exluded the year 2023. 4) CA_typology: this is the response variable that was created based on what we speculate could be an acceptable yield perfromance across the 6 seasos for CA: Acceptable: Average ≥ 0.95 but at least one season < 0.95 // Poor: Average between 0.85 and 0.95 // Very Poor: Average ≤ 0.85

Database:drivers_data Kept variables: Couch_Grass, ICR_Org_C_avg num: ICR_Avail_P_avg num: ICR_arable_land_avg num: ICR_rainfall_avg num: Comp_amount_corr_quali num: ICF_planting_date num: IQR_TF_tubura_client Factor: IQR_plot_fert_quality Factor: IQR_soil_texture Factor w/ 4 levels: IQR_soil_color Factor w/ 6 levels: IQF_position_slope Factor w/ 7 levels: Weed_species_A_combined: Factor w/ 5 levels: IQF_environment chr “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” avg_ICR_mulch_perc_planti,

RF description:

RF results: Effects on CA:CONV ration averaged along seasons. If we use all the variables we have a very good fit; % Var explained: 58. However, some of those variables would be difficult to measure or predict at scale: soil lab results (N, K, O, OC), planting dates and rains. So we remove those to see the capacity of the model and still provides high presition: % Var explained: 51. Using same list of variables on the last (removing soil test planting dates and rains) but expluding yields from 2023 A and B (protocols changes significantly after this year); % Var explained: 54.64. Looking only at maize, with the reduced list of variables: % Var explained: 44

Effects on Yield performance type (very poor, poor or acceptable) It is clear that we can effectively target plots with good chance of success of CA (81% success rate with th elimited list of “easy to measure drivers”). Sure, we only target 12% of the plots (that is about 25% of the farmers since farmers have 2.5 plots on average). We need to check the % of farmers with acceptable performance with other practices (ferlizer, seed, lime) and se how close we are. We can then regulate how restrictive we are to find a balance among Targeted plots vs. Success rate

Using all the variables: Total Test Cases: 130 Actual Acceptable (%): 22.31 % Farmers Targeted (%): 11.54 % Precision Acceptable (%): 86.67 % Recall Acceptable (%): 44.83 %

Using the easy to record ones: Total Test Cases: 130 Actual Acceptable (%): 22.31 % Farmers Targeted (%): 12.31 % Precision Acceptable (%): 81.25 % Recall Acceptable (%): 44.83 %

RF with outpot variable the average avg_Y_ratio on each block

# ---------------------------
# Load required packages
# ---------------------------
library(randomForest)
library(dplyr)

# ---------------------------
# Step 1: Select predictors and response
# ---------------------------
rf_data <- drivers_data %>%
  dplyr::select(
    ICR_N_perc_23A,
    Couch_Consit,
    Whitch_weed,
    #ICR_K_perc_23A,
    K_factor,
    ICF_Alt_avg,
    avg_Y_ratio,
    ICR_Org_C_avg,
    ICR_Avail_P_avg,
    #P_factor,
    ICR_arable_land_avg,
    ICR_rainfall_avg,
    Comp_amount_corr_quali,
    ICF_planting_date,
    Slope,
    #IQR_TF_tubura_client,
    #IQR_plot_fert_quality,
    IQR_soil_texture,
    #IQR_soil_color,
    #IQF_position_slope,
    #Weed_species_A_combined,
    #Weed_Overall_avg,
    avg_ICR_mulch_perc_planti,
    IQF_agzone,
    IQF_environment
  ) %>%
  mutate(across(where(is.character), as.factor))

# ---------------------------
# Step 2: Train-test split (85/15)
# ---------------------------
set.seed(42)
n <- nrow(rf_data)
train_indices <- sample(seq_len(n), size = 0.9 * n)

train_data <- rf_data[train_indices, ]
test_data <- rf_data[-train_indices, ]

# ---------------------------
# Step 3: Remove missing values from training set
# ---------------------------
train_data_clean <- train_data %>%
  filter(complete.cases(.))

# Check missing values
na_check <- colSums(is.na(train_data_clean))
print(na_check)
##            ICR_N_perc_23A              Couch_Consit               Whitch_weed 
##                         0                         0                         0 
##                  K_factor               ICF_Alt_avg               avg_Y_ratio 
##                         0                         0                         0 
##             ICR_Org_C_avg           ICR_Avail_P_avg       ICR_arable_land_avg 
##                         0                         0                         0 
##          ICR_rainfall_avg    Comp_amount_corr_quali         ICF_planting_date 
##                         0                         0                         0 
##                     Slope          IQR_soil_texture avg_ICR_mulch_perc_planti 
##                         0                         0                         0 
##                IQF_agzone           IQF_environment 
##                         0                         0
# ---------------------------
# Step 4: Fit Random Forest Model
# ---------------------------
set.seed(123)
rf_model <- randomForest(
  avg_Y_ratio ~ ., 
  data = train_data_clean,
  importance = TRUE,
  mtry = 6,
  ntree = 1000
)

# ---------------------------
# Step 5: Model Summary & Importance
# ---------------------------
print(rf_model)
## 
## Call:
##  randomForest(formula = avg_Y_ratio ~ ., data = train_data_clean,      importance = TRUE, mtry = 6, ntree = 1000) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 0.02915448
##                     % Var explained: 22.48
# Variable importance
randomForest::importance(rf_model)
##                             %IncMSE IncNodePurity
## ICR_N_perc_23A             5.478877    0.77136654
## Couch_Consit              13.877696    0.46494732
## Whitch_weed                3.761944    0.09099631
## K_factor                   8.515755    0.29039145
## ICF_Alt_avg               20.517271    1.40886559
## ICR_Org_C_avg             15.196437    1.14089425
## ICR_Avail_P_avg            9.326057    1.44706298
## ICR_arable_land_avg        8.138493    1.20997879
## ICR_rainfall_avg          31.073456    1.35473086
## Comp_amount_corr_quali    10.940014    1.34074224
## ICF_planting_date         15.314794    1.23020654
## Slope                      8.927881    0.59760106
## IQR_soil_texture           4.237955    0.21121077
## avg_ICR_mulch_perc_planti 11.372327    1.01379768
## IQF_agzone                37.099021    1.83227429
## IQF_environment           10.002677    0.26255174
# Plot importance (MSE Increase)
varImpPlot(rf_model, type = 1, main = "Variable Importance (MSE Increase)")

## RF with outpot variable the average Y_ratio_2425 on each block

# ---------------------------
# Load required packages
# ---------------------------
library(randomForest)
library(dplyr)

# ---------------------------
# Step 1: Select predictors and response
# ---------------------------
rf_data <- drivers_data %>%
  dplyr::select(
    #ICR_N_perc_23A,
    Couch_Consit,
    #ICR_K_perc_23A,
    K_factor,
    ICF_Alt_avg,
    Y_ratio_2425,
    ICR_Org_C_avg,
    ICR_Avail_P_avg,
    #P_factor,
    ICR_arable_land_avg,
    ICR_rainfall_avg,
    Comp_amount_corr_quali,
    ICF_planting_date,
    Slope,
    #slope_cat.y,
    #IQR_TF_tubura_client,
    #IQR_plot_fert_quality,
    IQR_soil_texture,
    #IQR_soil_color,
    #IQF_position_slope,
    #Weed_species_A_combined,
    avg_ICR_mulch_perc_planti,
    IQF_agzone,
    IQF_environment
  ) %>%
  mutate(across(where(is.character), as.factor))

# ---------------------------
# Step 2: Train-test split (85/15)
# ---------------------------
set.seed(42)
n <- nrow(rf_data)
train_indices <- sample(seq_len(n), size = 0.9 * n)

train_data <- rf_data[train_indices, ]
test_data <- rf_data[-train_indices, ]

# ---------------------------
# Step 3: Remove missing values from training set
# ---------------------------
train_data_clean <- train_data %>%
  filter(complete.cases(.))

# Check missing values
na_check <- colSums(is.na(train_data_clean))
print(na_check)
##              Couch_Consit                  K_factor               ICF_Alt_avg 
##                         0                         0                         0 
##              Y_ratio_2425             ICR_Org_C_avg           ICR_Avail_P_avg 
##                         0                         0                         0 
##       ICR_arable_land_avg          ICR_rainfall_avg    Comp_amount_corr_quali 
##                         0                         0                         0 
##         ICF_planting_date                     Slope          IQR_soil_texture 
##                         0                         0                         0 
## avg_ICR_mulch_perc_planti                IQF_agzone           IQF_environment 
##                         0                         0                         0
# ---------------------------
# Step 4: Fit Random Forest Model
# ---------------------------
set.seed(123)
rf_model <- randomForest(
  Y_ratio_2425 ~ ., 
  data = train_data_clean,
  importance = TRUE,
  ntree = 1000
)

# ---------------------------
# Step 5: Model Summary & Importance
# ---------------------------
print(rf_model)
## 
## Call:
##  randomForest(formula = Y_ratio_2425 ~ ., data = train_data_clean,      importance = TRUE, ntree = 1000) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.03994959
##                     % Var explained: 25.73
# Variable importance
randomForest::importance(rf_model)
##                             %IncMSE IncNodePurity
## Couch_Consit               9.511177     0.4126721
## K_factor                   7.867600     0.4409009
## ICF_Alt_avg               19.688306     2.0427041
## ICR_Org_C_avg             13.575540     1.5316198
## ICR_Avail_P_avg            7.718358     2.4426599
## ICR_arable_land_avg       12.364430     2.0911293
## ICR_rainfall_avg          22.983407     1.4899004
## Comp_amount_corr_quali     8.700705     1.8078609
## ICF_planting_date         14.269789     1.8932166
## Slope                     12.765729     1.2326012
## IQR_soil_texture           5.821194     0.3596684
## avg_ICR_mulch_perc_planti 13.679202     1.6956865
## IQF_agzone                31.513485     2.6277166
## IQF_environment           12.478549     0.5847764
# Plot importance (MSE Increase)
varImpPlot(rf_model, type = 1, main = "Variable Importance (MSE Increase)")

RF with the output variable being CA_Typology

Modulate sensitivity and presition

library(dplyr)
library(randomForest)
library(ggplot2)

# ======================
# CONTROL PANEL
# ======================

train_frac <- 0.85          # training fraction (try 0.7–0.9)
acceptable_weight <- 1    # try: 1, 2, 3, 5
accept_threshold <- 0.7   # try: 0.5–0.8
set.seed(42)

# ======================
# Step 1: Prepare RF dataset
# ======================

rf_data <- drivers_data %>%
  dplyr::select(
    #ICR_N_perc_23A,
    #ICR_K_perc_23A,
    #K_factor,
    Couch_Consit,
    #ICF_Alt_avg,
    CA_typology,
    #ICR_Org_C_avg,
    #ICR_Avail_P_avg,
    #P_factor,
    #ICR_arable_land_avg,
    #ICR_rainfall_avg,
    Comp_amount_corr_quali,
    ICF_planting_date,
    Slope,
    #IQR_TF_tubura_client,
    #IQR_plot_fert_quality,
    IQR_soil_texture,
    IQR_soil_color,
    IQF_position_slope,
    #Weed_species_A_combined,
    #Weed_Overall_avg,
    avg_ICR_mulch_perc_planti,
    IQF_agzone,
    #IQF_environment
  ) %>%
  mutate(
    across(where(is.character), as.factor),

    # Combine Good + Acceptable
    CA_typology = as.character(CA_typology),
    CA_typology = ifelse(CA_typology == "Good", "Acceptable", CA_typology),
    CA_typology = factor(CA_typology,
                         levels = c("Very Poor", "Poor", "Acceptable"))
  )

cat("Total RF rows:", nrow(rf_data), "\n")
## Total RF rows: 473
# ======================
# Step 2: Train / Test split
# ======================

n <- nrow(rf_data)
train_indices <- sample(seq_len(n), size = train_frac * n)

train_data <- rf_data[train_indices, ]
test_data  <- rf_data[-train_indices, ]

# ======================
# Step 3: Remove rows with NA
# ======================

train_data_clean <- train_data %>% filter(complete.cases(.))
test_data_clean  <- test_data  %>% filter(complete.cases(.))

cat("Training cases:", nrow(train_data_clean), "\n")
## Training cases: 400
cat("Test cases:", nrow(test_data_clean), "\n")
## Test cases: 70
# ======================
# Step 4: Random Forest (TRAIN ONLY)
# ======================

class_weights <- c(
  "Very Poor" = 1,
  "Poor" = 1,
  "Acceptable" = acceptable_weight
)

set.seed(123)
rf_model <- randomForest(
  CA_typology ~ .,
  data = train_data_clean,
  ntree = 1000,
  importance = TRUE,
  classwt = class_weights
)

print(rf_model)
## 
## Call:
##  randomForest(formula = CA_typology ~ ., data = train_data_clean,      ntree = 1000, importance = TRUE, classwt = class_weights) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 44.75%
## Confusion matrix:
##            Very Poor Poor Acceptable class.error
## Very Poor        201   18         13   0.1336207
## Poor              66    6         12   0.9285714
## Acceptable        57   13         14   0.8333333
# Variable importance
varImpPlot(rf_model, type = 1, main = "Variable Importance (MDA)")

# ======================
# Step 5: Predict on INDEPENDENT TEST SET (probabilities)
# ======================

prob_test <- predict(rf_model, newdata = test_data_clean, type = "prob")

pred_test <- ifelse(
  prob_test[, "Acceptable"] > accept_threshold,
  "Acceptable",
  colnames(prob_test)[apply(prob_test, 1, which.max)]
)

pred_test <- factor(pred_test, levels = levels(test_data_clean$CA_typology))

conf_mat <- table(
  Actual = test_data_clean$CA_typology,
  Predicted = pred_test
)

print(conf_mat)
##             Predicted
## Actual       Very Poor Poor Acceptable
##   Very Poor         35    7          2
##   Poor              10    4          0
##   Acceptable         8    2          2
# ======================
# Step 6: Metrics (TEST DATA)
# ======================

total_cases <- sum(conf_mat)

actual_acceptable <- sum(conf_mat["Acceptable", ])
predicted_acceptable <- sum(conf_mat[, "Acceptable"])
true_acceptable <- conf_mat["Acceptable", "Acceptable"]

perc_actual_acceptable <- actual_acceptable / total_cases * 100
perc_farmers_targeted <- predicted_acceptable / total_cases * 100
precision_acceptable <- true_acceptable / predicted_acceptable * 100
recall_acceptable <- true_acceptable / actual_acceptable * 100

# ======================
# Variable Importance Report
# ======================


importance_mat <- randomForest::importance(rf_model, type = 1)

importance_df <- as.data.frame(importance_mat)
importance_df$Variable <- rownames(importance_df)

importance_df <- importance_df %>%
  arrange(desc(MeanDecreaseAccuracy))

print(importance_df)
##                           MeanDecreaseAccuracy                  Variable
## IQF_agzone                           23.867107                IQF_agzone
## Comp_amount_corr_quali               11.904357    Comp_amount_corr_quali
## Couch_Consit                         11.091370              Couch_Consit
## IQR_soil_color                       10.269540            IQR_soil_color
## avg_ICR_mulch_perc_planti            10.089181 avg_ICR_mulch_perc_planti
## Slope                                 7.888358                     Slope
## IQR_soil_texture                      7.542038          IQR_soil_texture
## IQF_position_slope                    7.434393        IQF_position_slope
## ICF_planting_date                     6.505443         ICF_planting_date
# ======================
# Per-class Variable Importance Report
# ======================

imp_full <- as.data.frame(randomForest::importance(rf_model))
imp_full$Variable <- rownames(imp_full)

# Reorder columns for readability
imp_full <- imp_full %>%
  dplyr::select(Variable, everything())

print(imp_full)
##                                            Variable Very Poor       Poor
## Couch_Consit                           Couch_Consit 12.791466 -1.2491152
## Comp_amount_corr_quali       Comp_amount_corr_quali 14.155458 -4.1272110
## ICF_planting_date                 ICF_planting_date  4.663397  0.4991434
## Slope                                         Slope  7.517825 -0.8686574
## IQR_soil_texture                   IQR_soil_texture  5.085424  0.9724830
## IQR_soil_color                       IQR_soil_color  9.357470  3.4593709
## IQF_position_slope               IQF_position_slope  6.987894 -1.5381527
## avg_ICR_mulch_perc_planti avg_ICR_mulch_perc_planti 13.961329 -3.5958032
## IQF_agzone                               IQF_agzone 20.960818  6.9210858
##                           Acceptable MeanDecreaseAccuracy MeanDecreaseGini
## Couch_Consit                2.712832            11.091370         8.583545
## Comp_amount_corr_quali      5.976400            11.904357        51.272968
## ICF_planting_date           5.474326             6.505443        47.399878
## Slope                       5.248742             7.888358        30.511996
## IQR_soil_texture            6.383877             7.542038        10.119882
## IQR_soil_color              3.386498            10.269540        14.758805
## IQF_position_slope          5.680219             7.434393        21.607852
## avg_ICR_mulch_perc_planti  -1.110002            10.089181        44.824044
## IQF_agzone                  8.517459            23.867107        34.404995
# Optional: export
# write.csv(importance_df, "variable_importance.csv", row.names = FALSE)




# ======================
# Step 7: Print summary
# ======================

cat("\n================ TEST SET MODEL SUMMARY ================\n")
## 
## ================ TEST SET MODEL SUMMARY ================
cat("Train fraction:", train_frac, "\n")
## Train fraction: 0.85
cat("Acceptable weight:", acceptable_weight, "\n")
## Acceptable weight: 1
cat("Acceptable threshold:", accept_threshold, "\n\n")
## Acceptable threshold: 0.7
cat("Total Test Cases:", total_cases, "\n")
## Total Test Cases: 70
cat("Actual Acceptable (%):", round(perc_actual_acceptable, 2), "%\n")
## Actual Acceptable (%): 17.14 %
cat("Farmers Targeted (%):", round(perc_farmers_targeted, 2), "%\n")
## Farmers Targeted (%): 5.71 %
cat("Precision Acceptable (%):", round(precision_acceptable, 2), "%\n")
## Precision Acceptable (%): 50 %
cat("Recall Acceptable (%):", round(recall_acceptable, 2), "%\n")
## Recall Acceptable (%): 16.67 %
cat("======================================================\n")
## ======================================================
library(randomForest)

# Extract tree number 1
tree_1 <- getTree(rf_model, k = 2, labelVar = TRUE)

head(tree_1)
library(rpart)
library(rpart.plot)

tree_model <- rpart(
  CA_typology ~ .,
  data = train_data_clean,
  method = "class"
)

rpart.plot(tree_model)

Same buyt with Typology_09

library(dplyr)
library(randomForest)
library(ggplot2)

# ======================
# CONTROL PANEL
# ======================

train_frac <- 0.85          # training fraction (try 0.7–0.9)
acceptable_weight <- 0.7    # try: 1, 2, 3, 5
accept_threshold <- 0.7   # try: 0.5–0.8
set.seed(42)

# ======================
# Step 1: Prepare RF dataset
# ======================

rf_data <- drivers_data %>%
  dplyr::select(
    #ICR_N_perc_23A,
    #ICR_K_perc_23A,
    #K_factor,
    #Couch_Grass,
    Couch_Consit,
    #Whitch_weed,
    #ICF_Alt_avg,
    CA_typology_09,
    #ICR_Org_C_avg,
    #ICR_Avail_P_avg,
    #P_factor,
    #ICR_arable_land_avg,
    #ICR_rainfall_avg,
    Comp_amount_corr_quali,
    ICF_planting_date,
    Slope,
    #slope_cat.y,
    #IQR_TF_tubura_client,
    #IQR_plot_fert_quality,
    IQR_soil_texture,
    IQR_soil_color,
    IQF_position_slope,
    #Weed_species_A_combined,
    #Weed_Overall_avg,
    avg_ICR_mulch_perc_planti,
    IQF_agzone,
    #IQF_environment
  ) %>%
  mutate(
    across(where(is.character), as.factor),

    # Combine Good + Acceptable
    CA_typology_09 = as.character(CA_typology_09),
    CA_typology_09 = ifelse(CA_typology_09 == "Good", "Acceptable", CA_typology_09),
    CA_typology_09 = factor(CA_typology_09,
                         levels = c("Very Poor", "Poor", "Acceptable"))
  )

cat("Total RF rows:", nrow(rf_data), "\n")
## Total RF rows: 473
# ======================
# Step 2: Train / Test split
# ======================

n <- nrow(rf_data)
train_indices <- sample(seq_len(n), size = train_frac * n)

train_data <- rf_data[train_indices, ]
test_data  <- rf_data[-train_indices, ]

# ======================
# Step 3: Remove rows with NA
# ======================

train_data_clean <- train_data %>% filter(complete.cases(.))
test_data_clean  <- test_data  %>% filter(complete.cases(.))

cat("Training cases:", nrow(train_data_clean), "\n")
## Training cases: 400
cat("Test cases:", nrow(test_data_clean), "\n")
## Test cases: 70
# ======================
# Step 4: Random Forest (TRAIN ONLY)
# ======================

class_weights <- c(
  "Very Poor" = 1,
  "Poor" = 1,
  "Acceptable" = acceptable_weight
)

set.seed(123)
rf_model <- randomForest(
  CA_typology_09 ~ .,
  data = train_data_clean,
  ntree = 1000,
  importance = TRUE,
  classwt = class_weights
)

print(rf_model)
## 
## Call:
##  randomForest(formula = CA_typology_09 ~ ., data = train_data_clean,      ntree = 1000, importance = TRUE, classwt = class_weights) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 45.75%
## Confusion matrix:
##            Very Poor Poor Acceptable class.error
## Very Poor        159    7         33   0.2010050
## Poor              38   13         28   0.8354430
## Acceptable        61   16         45   0.6311475
# Variable importance
varImpPlot(rf_model, type = 1, main = "Variable Importance (MDA)")

# ======================
# Step 5: Predict on INDEPENDENT TEST SET (probabilities)
# ======================

prob_test <- predict(rf_model, newdata = test_data_clean, type = "prob")

pred_test <- ifelse(
  prob_test[, "Acceptable"] > accept_threshold,
  "Acceptable",
  colnames(prob_test)[apply(prob_test, 1, which.max)]
)

pred_test <- factor(pred_test, levels = levels(test_data_clean$CA_typology_09))

conf_mat <- table(
  Actual = test_data_clean$CA_typology_09,
  Predicted = pred_test
)

print(conf_mat)
##             Predicted
## Actual       Very Poor Poor Acceptable
##   Very Poor         28    3          5
##   Poor               8    4          2
##   Acceptable         9    4          7
# ======================
# Step 6: Metrics (TEST DATA)
# ======================

total_cases <- sum(conf_mat)

actual_acceptable <- sum(conf_mat["Acceptable", ])
predicted_acceptable <- sum(conf_mat[, "Acceptable"])
true_acceptable <- conf_mat["Acceptable", "Acceptable"]

perc_actual_acceptable <- actual_acceptable / total_cases * 100
perc_farmers_targeted <- predicted_acceptable / total_cases * 100
precision_acceptable <- true_acceptable / predicted_acceptable * 100
recall_acceptable <- true_acceptable / actual_acceptable * 100

# ======================
# Variable Importance Report
# ======================


importance_mat <- randomForest::importance(rf_model, type = 1)

importance_df <- as.data.frame(importance_mat)
importance_df$Variable <- rownames(importance_df)

importance_df <- importance_df %>%
  arrange(desc(MeanDecreaseAccuracy))

print(importance_df)
##                           MeanDecreaseAccuracy                  Variable
## IQF_agzone                           27.268516                IQF_agzone
## Comp_amount_corr_quali               14.786359    Comp_amount_corr_quali
## ICF_planting_date                    12.593533         ICF_planting_date
## Couch_Consit                         11.781759              Couch_Consit
## avg_ICR_mulch_perc_planti            10.725758 avg_ICR_mulch_perc_planti
## IQF_position_slope                   10.198440        IQF_position_slope
## Slope                                 9.174468                     Slope
## IQR_soil_color                        6.316761            IQR_soil_color
## IQR_soil_texture                      3.402108          IQR_soil_texture
# ======================
# Per-class Variable Importance Report
# ======================

imp_full <- as.data.frame(randomForest::importance(rf_model))
imp_full$Variable <- rownames(imp_full)

# Reorder columns for readability
imp_full <- imp_full %>%
  dplyr::select(Variable, everything())

print(imp_full)
##                                            Variable Very Poor      Poor
## Couch_Consit                           Couch_Consit 13.838714 -3.284431
## Comp_amount_corr_quali       Comp_amount_corr_quali 18.523836 -1.123834
## ICF_planting_date                 ICF_planting_date  9.651886  6.674355
## Slope                                         Slope  7.073606  4.898445
## IQR_soil_texture                   IQR_soil_texture  2.562901 -3.602057
## IQR_soil_color                       IQR_soil_color  4.549651  2.037030
## IQF_position_slope               IQF_position_slope  5.843769  3.963753
## avg_ICR_mulch_perc_planti avg_ICR_mulch_perc_planti 14.740756  4.611885
## IQF_agzone                               IQF_agzone 25.731796 10.768075
##                           Acceptable MeanDecreaseAccuracy MeanDecreaseGini
## Couch_Consit                4.314735            11.781759         8.688993
## Comp_amount_corr_quali      2.484765            14.786359        51.234172
## ICF_planting_date           5.872505            12.593533        46.299473
## Slope                       3.494318             9.174468        30.118431
## IQR_soil_texture            4.886926             3.402108         8.544853
## IQR_soil_color              3.786981             6.316761        13.800649
## IQF_position_slope          7.618268            10.198440        21.184165
## avg_ICR_mulch_perc_planti  -4.427283            10.725758        44.354002
## IQF_agzone                  4.433662            27.268516        36.120029
# Optional: export
# write.csv(importance_df, "variable_importance.csv", row.names = FALSE)




# ======================
# Step 7: Print summary
# ======================

cat("\n================ TEST SET MODEL SUMMARY ================\n")
## 
## ================ TEST SET MODEL SUMMARY ================
cat("Train fraction:", train_frac, "\n")
## Train fraction: 0.85
cat("Acceptable weight:", acceptable_weight, "\n")
## Acceptable weight: 0.7
cat("Acceptable threshold:", accept_threshold, "\n\n")
## Acceptable threshold: 0.7
cat("Total Test Cases:", total_cases, "\n")
## Total Test Cases: 70
cat("Actual Acceptable (%):", round(perc_actual_acceptable, 2), "%\n")
## Actual Acceptable (%): 28.57 %
cat("Farmers Targeted (%):", round(perc_farmers_targeted, 2), "%\n")
## Farmers Targeted (%): 20 %
cat("Precision Acceptable (%):", round(precision_acceptable, 2), "%\n")
## Precision Acceptable (%): 50 %
cat("Recall Acceptable (%):", round(recall_acceptable, 2), "%\n")
## Recall Acceptable (%): 35 %
cat("======================================================\n")
## ======================================================
library(randomForest)

# Extract tree number 1
tree_1 <- getTree(rf_model, k = 2, labelVar = TRUE)

head(tree_1)
library(rpart)
library(rpart.plot)

tree_model <- rpart(
  CA_typology_09 ~ .,
  data = train_data_clean,
  method = "class"
)

rpart.plot(tree_model)

jpeg("D:/Mes Donnees/OAF_CIRAD/CA_MoU/figures/regression_tree.jpg",
     width = 2000,
     height = 1500,
     res = 300)

rpart.plot(tree_model)

dev.off()
## png 
##   2

ID of drivers trough comparing top with poor performers

We will compare the farms on the top of performance (30% best average CA:CONV across the 6 seasons) with the lowest performer (30% bottom) for the different variables. Specially for the variables that have more clear impact: agzone mulch compost planting date Soil texture Rain