#Libraries + the raw database that comes directly from the google-sheet shared by OAF (https://docs.google.com/spreadsheets/d/1ycoY3k0vQIEYVtDBqDFWpYZF_p4sMNF59y8QM6PkdGE/edit?gid=1553488959#gid=1553488959). You can also find description of the variables here(https://docs.google.com/spreadsheets/d/1ycoY3k0vQIEYVtDBqDFWpYZF_p4sMNF59y8QM6PkdGE/edit?gid=366994340#gid=366994340)

Data Cleaning; elimination some outliers for yield

I use a linear mixed model to identify outliers in yield (DC_Yield_t_ha), flagged by large residuals or influence diagnostics (e.g. standardized residuals, Cook’s distance, leverage), and then count how many outliers exist per season and treatment code.

Notes: I used standard residue of >4 to consider outlier (its high but on farm across the country and several seasons we should be more flexible).

# Data Cleaning; elimination some outliers for yield

# Load required libraries
library(lme4)
library(lmerTest)
library(broom.mixed)
library(dplyr)

# ---- 1. Prepare data ----
model_data <- ca_raw %>%
  filter(
    !is.na(DC_Yield_t_ha),
    !is.na(IQR_block),
    !is.na(IQF_environment),
    !is.na(Treat_code),
    !is.na(IQR_Season)
  ) %>%
  mutate(
    DC_Yield_t_ha = as.numeric(DC_Yield_t_ha),
    Treat_code = as.factor(Treat_code),
    IQF_environment = as.factor(IQF_environment),
    IQR_Season = as.factor(IQR_Season),
    IQR_block = as.factor(IQR_block)
  )

# ---- 2. Fit linear mixed model ----
model_resid <- lmer(
  DC_Yield_t_ha ~ Treat_code * IQR_Season + 
    (1 | IQF_environment) + (1 | IQR_block),
  data = model_data
)

# ---- 3. Get residuals and flag outliers ----
resid_data <- augment(model_resid) %>%
  mutate(
    std_resid = .resid / sd(.resid, na.rm = TRUE),
    is_outlier = abs(std_resid) > 4
  )

# 1. Compute residuals with augment() and join to model data
resid_data <- bind_cols(model_data, augment(model_resid))

# 2. Compute residual metrics
# ---- 3. Extract residuals (correctly aligned) ----
resid_data <- augment(model_resid, data = model_data) %>%
  mutate(
    std_resid = scale(.resid)[, 1],
    is_outlier = abs(std_resid) > 4
  )

# ---- 4. Count outliers per group ----
outlier_counts <- resid_data %>%
  filter(is_outlier) %>%
  group_by(IQR_Season, IQF_environment, Treat_code) %>%
  summarise(n_outliers = n(), .groups = "drop")


# ---- 6. Print results ----
print(outlier_counts)
## # A tibble: 13 × 4
##    IQR_Season IQF_environment Treat_code n_outliers
##    <fct>      <fct>           <fct>           <int>
##  1 23A        Mid Low Dry     A                   3
##  2 23A        Mid Low Dry     B                   1
##  3 23A        Mid Low Dry     C                   2
##  4 23A        Mid Low Dry     D                   1
##  5 24A        Highland        A                   5
##  6 24A        Highland        B                   4
##  7 24A        Highland        C                   4
##  8 24A        Highland        D                   4
##  9 24A        Mid_Low_Wet     B                   1
## 10 24A        Mid_Low_Wet     D                   1
## 11 24A        Transition      A                   1
## 12 25A        Highland        A                   2
## 13 25A        Mid_Low_Wet     A                   1
# ---- 7. Total number of outliers ----
cat("Total outliers detected:", sum(resid_data$is_outlier), "\n")
## Total outliers detected: 30

Since we are pulling together CAs treatments lated, I eliminated outliers for CA but I kept the blocks, assuming the average of the 2 left will not ve very affected. However, where I wliminate treatment A (the conventional), then I eliminate the block for yield analysis: total of 12 in the total 6 seasons = 6/2000 and something

library(dplyr)

# 1. Create a unique identifier for Season + Block
resid_data <- resid_data %>%
  mutate(IQF_SeasonblockID = paste(IQR_block, IQR_Season, sep = "_"))


# 2. Identify Season-Block IDs where A was an outlier
remove_full_blocks <- resid_data %>%
  filter(is_outlier == TRUE, Treat_code == "A") %>%
  pull(IQF_SeasonblockID) %>%
  unique()

# 3. Identify rows to remove for B, C, D (but not whole blocks)
remove_specific_rows <- resid_data %>%
  filter(is_outlier == TRUE, Treat_code %in% c("B", "C", "D")) %>%
  dplyr::select(IQR_Season, IQR_block, Treat_code)


# 4. Start from ca_raw and remove:
#    - Full blocks where A was outlier
#    - Specific rows where B/C/D were outlier
yield_n_out <- ca_raw %>%
  # Remove blocks where A was an outlier
  filter(!(IQF_SeasonblockID %in% remove_full_blocks)) %>%
  # Remove specific rows (B, C, D outliers)
  anti_join(remove_specific_rows, by = c("IQR_Season", "IQR_block", "Treat_code")) %>%
  dplyr::select(-IQF_SeasonblockID)  # remove temp column

Which environmental zoning should we use: AEZ-OAF or the conventional Rwandan AEZ?

To compare zoning systems, we evaluated how much variance in yield is explained when modeling the effects of season, zone, and treatment, using either OAF_AEZ or the national AEZ classification. The zoning system that accounts for more variance provides the better environmental stratification for yield analysis.

Notes: We use OAF_AEZ because (now on called IQF_environment): 1- it better represents the biophysical and agronomic conditions that drive yield variation (e.g., moisture regime, elevation, temperature). 2- In our models, OAF_AEZ explains substantially more variance than the national AEZ classification (η² = 0.49 vs. 0.21). Moreover since it has 4 cathergories (AEZ has 8) the power to stufy effect of environmental zone and interactions will give us more power. 3 -OAF_AEZ is also closely aligned with crop-specific agronomic realities, including maize variety zones (highland vs. mid-altitude) and bean growth habit (bush vs. climbing). 4- While national AEZs may capture broader social, infrastructural, and historical patterns due to their geographic contiguity, we will analyze these socio-spatial drivers separately when modeling household-level effects on yield

#Fix mistake in DC_Yield_t_ha, were some Highland are spelled Higland
# Replace misspelling in IQF_environment
# Fix mistakes in IQF_environment
yield_n_out <- yield_n_out %>%
  mutate(
    IQF_environment = gsub("^<", "", IQF_environment),
    IQF_environment = case_when(
      tolower(IQF_environment) == "higland"   ~ "Highland",
      tolower(IQF_environment) == "highland"  ~ "Highland",
      IQF_environment == "mid_low_wet"        ~ "Mid_Low_Wet",
      IQF_environment == "mid_low_dry"        ~ "Mid_Low_Dry",
      IQF_environment == "Mid Low Dry"        ~ "Mid_Low_Dry",
      IQF_environment == "Mid Low Wet"        ~ "Mid_Low_Wet",
      IQF_environment == "transition"         ~ "Transition",
      TRUE                                    ~ IQF_environment
    )
  )

# Load libraries
library(lme4)
library(lmerTest)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:mvtnorm':
## 
##     standardize
library(performance)

# Clean dataset
yield_n_out_clean <- yield_n_out %>%
  filter(!is.na(DC_Yield_t_ha), !is.na(Treat_code))

# Fit models
mod_agzone <- lmer(
  DC_Yield_t_ha ~ Treat_code * IQF_agzone + IQR_Season + (1 | IQR_block),
  data = yield_n_out_clean
)

mod_env <- lmer(
  DC_Yield_t_ha ~ Treat_code * IQF_environment + IQR_Season + (1 | IQR_block),
  data = yield_n_out_clean
)

# Type III ANOVA
anova_agzone <- Anova(mod_agzone, type = 3)
anova_env    <- Anova(mod_env,    type = 3)

# Partial Eta Squared (correct for current effectsize)
eta_agzone <- eta_squared(mod_agzone)
eta_env    <- eta_squared(mod_env)


# R² measures
r2_agzone <- r2(mod_agzone)
r2_env    <- r2(mod_env)

# Print results
cat("=== ANOVA: AgZone Model ===\n")
## === ANOVA: AgZone Model ===
print(anova_agzone)
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: DC_Yield_t_ha
##                          Chisq Df Pr(>Chisq)    
## (Intercept)            1358.72  1  < 2.2e-16 ***
## Treat_code              104.75  3  < 2.2e-16 ***
## IQF_agzone              188.60 10  < 2.2e-16 ***
## IQR_Season            16996.08  5  < 2.2e-16 ***
## Treat_code:IQF_agzone   112.50 30  1.791e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n=== ANOVA: Environment Model ===\n")
## 
## === ANOVA: Environment Model ===
print(anova_env)
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: DC_Yield_t_ha
##                                Chisq Df Pr(>Chisq)    
## (Intercept)                 3542.038  1  < 2.2e-16 ***
## Treat_code                   329.467  3  < 2.2e-16 ***
## IQF_environment              238.960  3  < 2.2e-16 ***
## IQR_Season                 17421.749  5  < 2.2e-16 ***
## Treat_code:IQF_environment    28.358  9  0.0008308 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n=== Partial Eta²: AgZone Model ===\n")
## 
## === Partial Eta²: AgZone Model ===
print(eta_agzone)
## # Effect Size for ANOVA (Type III)
## 
## Parameter             | Eta2 (partial) |       95% CI
## -----------------------------------------------------
## Treat_code            |       2.81e-03 | [0.00, 1.00]
## IQF_agzone            |           0.19 | [0.14, 1.00]
## IQR_Season            |           0.61 | [0.61, 1.00]
## Treat_code:IQF_agzone |           0.01 | [0.01, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
cat("\n=== Partial Eta²: Environment Model ===\n")
## 
## === Partial Eta²: Environment Model ===
print(eta_env)
## # Effect Size for ANOVA (Type III)
## 
## Parameter                  | Eta2 (partial) |       95% CI
## ----------------------------------------------------------
## Treat_code                 |           0.07 | [0.06, 1.00]
## IQF_environment            |           0.10 | [0.08, 1.00]
## IQR_Season                 |           0.62 | [0.61, 1.00]
## Treat_code:IQF_environment |       2.68e-03 | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
cat("\n=== R²: AgZone Model ===\n")
## 
## === R²: AgZone Model ===
print(r2_agzone)
## # R2 for Mixed Models
## 
##   Conditional R2: 0.693
##      Marginal R2: 0.541
cat("\n=== R²: Environment Model ===\n")
## 
## === R²: Environment Model ===
print(r2_env)
## # R2 for Mixed Models
## 
##   Conditional R2: 0.725
##      Marginal R2: 0.505

Testing the three-way interaction

To assess whether the interaction among Treatment, Environment (OAF_AEZ), and Season improves the model, we compared a full model (including the 3-way interaction) with a reduced model (no 3-way term). Because lmer() does not provide p-values for interaction terms, we used a likelihood ratio test via anova() to compare the nested models.

Notes: The likelihood ratio test shows strong evidence that the three-way interaction (Treatment × Environment × Season) significantly improves model fit. Therefore, we proceed to open the interaction and evaluate 2 way interactions (Environmetn x treatment)

Models

Full model (with 3-way interaction):

model_full <- lmer(
  DC_Yield_t_ha ~ Treat_code * IQF_environment * IQR_Season +
    (1 | IQR_block) + (1 | IQF_environment) + (1 | IQR_Season),
  data = yield_n_out,
  REML = FALSE
)
## fixed-effect model matrix is rank deficient so dropping 2 columns / coefficients
## boundary (singular) fit: see help('isSingular')

Reduced model (no 3-way interaction):

model_no_3way <- lmer(
  DC_Yield_t_ha ~ Treat_code * IQF_environment + Treat_code * IQR_Season +
    (1 | IQR_block) + (1 | IQF_environment) + (1 | IQR_Season),
  data = yield_n_out,
  REML = FALSE
)
## boundary (singular) fit: see help('isSingular')

Likelihood ratio test

anova(model_no_3way, model_full)
## Data: yield_n_out
## Models:
## model_no_3way: DC_Yield_t_ha ~ Treat_code * IQF_environment + Treat_code * IQR_Season + (1 | IQR_block) + (1 | IQF_environment) + (1 | IQR_Season)
## model_full: DC_Yield_t_ha ~ Treat_code * IQF_environment * IQR_Season + (1 | IQR_block) + (1 | IQF_environment) + (1 | IQR_Season)
##               npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## model_no_3way   40 36253 36546 -18086    36173                         
## model_full      98 35246 35964 -17525    35050 1122.4 58  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Test the AEZ x Treatment interaction, in each season

Now that we confirmed a significant 3-way interaction, next step is to analyze each season separately and test for Treat_code × IQF_environment interaction within each season.

Notes: Interaction is significant in all seasons but one season (25B). Then continue with post-hoc comparisons per agroecological zone (IQF_environment) within each season.

## 
## 
## ==== SEASON: 24A ====
## Type III Analysis of Variance Table with Satterthwaite's method
##                            Sum Sq Mean Sq NumDF   DenDF  F value    Pr(>F)    
## Treat_code                 499.92 166.639     3 1384.52 259.4289 < 2.2e-16 ***
## IQF_environment             55.04  18.347     3  598.19  28.5635 < 2.2e-16 ***
## Treat_code:IQF_environment  48.48   5.387     9 1384.37   8.3864 2.883e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==== SEASON: 25A ====
## Type III Analysis of Variance Table with Satterthwaite's method
##                             Sum Sq Mean Sq NumDF   DenDF  F value    Pr(>F)    
## Treat_code                 270.620  90.207     3 1139.84 142.7775 < 2.2e-16 ***
## IQF_environment             13.560   4.520     3  553.54   7.1543 0.0001015 ***
## Treat_code:IQF_environment  23.579   2.620     9 1140.14   4.1466 2.817e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==== SEASON: 23A ====
## Type III Analysis of Variance Table with Satterthwaite's method
##                             Sum Sq Mean Sq NumDF  DenDF  F value  Pr(>F)    
## Treat_code                 315.910 105.303     3 1654.2 146.5477 < 2e-16 ***
## IQF_environment              4.243   1.414     3  686.7   1.9682 0.11741    
## Treat_code:IQF_environment  13.302   1.478     9 1654.6   2.0568 0.03032 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==== SEASON: 24B ====
## Type III Analysis of Variance Table with Satterthwaite's method
##                            Sum Sq Mean Sq NumDF   DenDF  F value    Pr(>F)    
## Treat_code                 94.073 31.3577     3 1406.88 378.9046 < 2.2e-16 ***
## IQF_environment             2.195  0.7315     3  700.07   8.8395 9.323e-06 ***
## Treat_code:IQF_environment  1.806  0.2006     9 1407.59   2.4243  0.009863 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==== SEASON: 23B ====
## Type III Analysis of Variance Table with Satterthwaite's method
##                            Sum Sq Mean Sq NumDF  DenDF  F value    Pr(>F)    
## Treat_code                 44.422 14.8075     3 1540.1 184.0492 < 2.2e-16 ***
## IQF_environment             1.045  0.3482     3  697.6   4.3285  0.004914 ** 
## Treat_code:IQF_environment  8.192  0.9102     9 1540.5  11.3135 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==== SEASON: 25B ====
## fixed-effect model matrix is rank deficient so dropping 2 columns / coefficients
## Missing cells for: Treat_codeB:IQF_environmentHighland, Treat_codeD:IQF_environmentHighland.  
## Interpret type III hypotheses with care.
## Type III Analysis of Variance Table with Satterthwaite's method
##                            Sum Sq Mean Sq NumDF   DenDF F value  Pr(>F)    
## Treat_code                 34.379 11.4598     3 1120.51 77.9272 < 2e-16 ***
## IQF_environment             1.500  0.5001     3  450.89  3.4004 0.01775 *  
## Treat_code:IQF_environment  1.258  0.1798     7 1120.47  1.2225 0.28706    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Testing the AEZ × Treatment interaction within each season

Given that the three-way interaction (Treatment × Environment × Season) is significant, the correct next step is to analyze each season separately and test for the Treatment × Environment interaction within that season. For each season, we fit a model including the Treat_code × IQF_environment interaction and evaluated its significance.

Notes: The interaction is significant in all seasons except one (25B). This indicates that treatment effects vary across environmental zones for nearly every season, supporting the need for zone-specific post-hoc comparisons.

Models per season

## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons

#Post-hoc comparisons within each Season × AEZ After confirming interaction patterns, we conducted post-hoc comparisons of treatment effects within every Season × AEZ combination. We estimated marginal means per treatment and tested whether the CA treatments (B, C, D) differed from each other.

Across 24 Season–AEZ combinations: -Treatment effect is significant in all 24 combinations. -In none of the 24 combinations do CA treatments (B, C, D) differ significantly from each other; all CA treatments form a statistically homogeneous group.

Notes: Because B, C, and D do not differ from each other in any environment–season context, they can be pooled into a single CA group to increase statistical power when comparing CA vs. Conventional systems in the next steps of the analysis. We will have to clearly explain that we are comparing cropping systems, which contain heterogeneity within them (varieties, management along the seasons, etc)

library(dplyr)
library(emmeans)
library(multcomp)
library(multcompView)

summary_results <- list()

# All combinations of Season × AEZ
season_aez_combos <- yield_n_out %>%
  distinct(IQR_Season, IQF_environment) %>%
  arrange(IQR_Season, IQF_environment)

for (i in 1:nrow(season_aez_combos)) {
  
  season <- season_aez_combos$IQR_Season[i]
  aez    <- season_aez_combos$IQF_environment[i]
  
  sub_data <- yield_n_out %>%
    filter(IQR_Season == season, IQF_environment == aez)
  
  if (length(unique(sub_data$Treat_code)) > 1 && nrow(sub_data) >= 5) {
    
    model <- lm(DC_Yield_t_ha ~ Treat_code, data = sub_data)
    
    # p-value
    anova_p <- anova(model)["Treat_code", "Pr(>F)"]
    
    # letters
    emm  <- emmeans(model, "Treat_code")
    pw   <- multcomp::cld(emm, adjust = "tukey")
    df_pw <- as.data.frame(pw)
    
    # Clean `.group` (" a b" -> "ab")
    df_pw$clean_group <- gsub(" ", "", df_pw$.group)
    
groups_bcd <- df_pw %>%
  filter(Treat_code %in% c("B", "C", "D")) %>%
  dplyr::select(Treat_code, clean_group)

    
    # Default
    bc_d_diff <- "No"
    
    # Only check differences if >= 2 treatments available
    if (nrow(groups_bcd) >= 2) {
      
      # All pairwise treatment letter sets
      combos <- combn(groups_bcd$clean_group, 2, simplify = FALSE)
      
      for (combo in combos) {
        
        g1 <- unlist(strsplit(combo[1], ""))  # split letters
        g2 <- unlist(strsplit(combo[2], ""))
        
        # If NO shared letters → significant difference
        if (length(intersect(g1, g2)) == 0) {
          bc_d_diff <- "Yes"
          break
        }
      }
    }
    
    summary_results[[paste(season, aez, sep = "_")]] <- data.frame(
      IQR_Season = season,
      IQF_environment = aez,
      Treat_pvalue = round(anova_p, 5),
      B_C_D_different = bc_d_diff
    )
  }
}
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
results_table <- bind_rows(summary_results)

How many times effect of trat was significant (p<=0.5) and how many timmes at least 2 CA treatmes were differenc eamong them:

## Significant p-values (<= 0.05): 24
## B/C/D differences detected: 0
## Total Season-AEZ combinations: 24

Plot results with letters and mean values

Combining CA Treatments

Because CA treatments B, C, and D showed no significant differences across any Season × AEZ combination, we combined them into a single CA group. Numerical variables (including yield) were averaged across the three treatments, while categorical variables were taken from treatment C, the most agronomically consistent option. This provides a cleaner dataset and increases power for comparing CA vs. Conventional systems.

Yield comparison of CA vs CONV by AEZ and season

We compared CA vs Conventional (CONV) yields using linear mixed models fitted separately for each Season × AEZ combination. For each model, we extracted; the p-value for the CA vs CONV contrast, and the relative yield defined as:

Relative yield = CA yield / CONV yield

Notes:

Average relative yield (CA / CONV): 0.758, i.e. CA yields are on average about 24% lower than CONV, and all comparisons are significant: 24 out of 24 Season × AEZ combinations show a significant difference (p ≤ 0.05) between CA and CONV. These results indicate that, across all tested environments and seasons, CA yields are consistently and significantly lower than those under conventional tillage.

Average difference and % of significant comparisons

# 1. Average relative yield
average_relative_yield <- mean(yield_comparison_table$relative_yield, na.rm = TRUE)

# 2. Count of significant p-values
n_significant <- sum(yield_comparison_table$p_value <= 0.05, na.rm = TRUE)

# 3. Total number of comparisons
n_total <- nrow(yield_comparison_table)

# Display the results
cat("Average Relative Yield (CA / CONV):", round(average_relative_yield, 3), "\n")
## Average Relative Yield (CA / CONV): 0.758
cat("Significant differences (p ≤ 0.05):", n_significant, "of", n_total, "\n")
## Significant differences (p ≤ 0.05): 24 of 24

#Cumulative distribution of relative yield (Season × AEZ level) We visualized the distribution of relative yield (CA / CONV) across Season × AEZ combinations using a cumulative frequency plot of relative yield (CA / CONV) for the average of season-agzone combinations with:X-axis: Relative yield (sorted increasing).

Notes:

This figure shows that even when targeting at the AEZ level, it is not possible to “pick out” zones where CA has no yield penalty. The signal is highly variable within these broad zones, suggesting that effective targeting would need to happen at the plot or farm level, not just by AEZ. Same but with color profression by season

#Plot-level relative yield (block × season)

We then moved from aggregated Season × AEZ comparisons to the plot level (each block = one farm in one season).We plotted the cumulative frequency of plot-level relative yield, adding a vertical reference line at relative yield = 1 (no yield penalty).

Notes:

At the plot level, a share of plots have little or no yield penalty under CA. Depending on the cut-off used, roughly 15–20% of plots show relative yields close to or above 1, indicating that CA can perform competitively on a subset of farms even though the overall average is negative.

Interpretation of yield comparison between CA and CONV (block–season combinations)

This figure shows the distribution of relative yield (CA / CONV) for each block–season combination. Each point represents the yield performance of one farm in one specific season, comparing CA to its paired CONV value.

Notes: Across AEZs, about 15–20% of block–season observations have relative yields close to or above 1. This means that although CA clearly underperforms on average at the ag-zone level, a meaningful subset of fields in individual seasons shows no yield penalty. This variation highlights that CA performance is highly plot-specific and cannot be reliably predicted using only AEZ-level information.

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).

Another way to see it:

#Cumulative yield difference per farm (multi-season average).

Here we summarize the mid-term yield impact (3 years, 6 seasons) of CA at the farm level. For each block (farm), we first compute the relative yield (CA / CONV) at the block–season level and then calculate the average relative yield across seasons (average of the 6 seasonal rations). This gives one value per farm: its mean CA performance relative to Conventional over time.

Notes:

When looking at farms over multiple seasons, again most blocks (82%) have an average relative yield below 1, meaning a cumulative yield penalty under CA. Only a 18% share of farms in each AEZ reaches or exceeds yield parity. What looked as average is a non-viable practice, however it that fraction of farmers that benefit from CA can be targeter, then potential benefits of CA can be exploded.

## # A tibble: 577 × 4
##    IQR_block avg_relative_yield IQF_environment IQF_District
##    <chr>                  <dbl> <chr>           <chr>       
##  1 102_1                  0.828 Mid_Low_Wet     Bugarama    
##  2 102_10                 0.544 Mid_Low_Wet     Bugarama    
##  3 102_11                 0.919 Mid_Low_Wet     Bugarama    
##  4 102_12                 0.436 Mid_Low_Wet     Bugarama    
##  5 102_13                 0.768 Mid_Low_Wet     Bugarama    
##  6 102_14                 1.94  Mid_Low_Wet     Bugarama    
##  7 102_2                  0.726 Mid_Low_Wet     Bugarama    
##  8 102_3                  0.750 Mid_Low_Wet     Bugarama    
##  9 102_4                  0.753 Mid_Low_Wet     Bugarama    
## 10 102_5                  0.781 Mid_Low_Wet     Bugarama    
## # ℹ 567 more rows

Now a figure of cummulative frequency of relative performance

Another way to see frequency of performance of average (the 6 seasons averaged) relative yield of CA

## Warning: Removed 28 rows containing non-finite outside the scale range
## (`stat_density()`).

#Evolution of CA relative yield performance across six seasons

While looking at the cummulative yield difference between conventional and CA is useful from an agronomic perspective, it is missess relevance fro farmers. For example, farmers can’t afford large lossess in the initial seasons even if there is s compensation 2 or 3 years after. So its important to look at the effect of CA as a video and not as a picture.

We first try to understand whether farms consistently perform well or poorly under CA. For that we divide each season the blocks in good, mis and poor performers dividing the yield response (Realtive yield of CA = Ca/CONV) in three 3rds high, mid and low. Using these seasonal classifications, we create alluvial diagrams to visualize how each block’s performance group changes across seasons.

Note:

The alluvial plots show that blocks rarely stay in the same performance group from season to season. Many farms move between high, mid, and low categories, indicating that CA performance is highly unstable over time at the farm level. Only a small number of blocks consistently remain high-performing or consistently remain low-performing across all seasons.

## Warning: package 'ggalluvial' was built under R version 4.3.3
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

Another way to explore the evolution of performance is the correlation of performance along the seasons, specially with season 1. That is having good performance is season 1 correlates better with the rest? And with overall?

## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded

# Classification of seasonal CA performance using absolute thresholds As an alternative to tercile-based rankings, we also classify each block–season combination using a set of absolute thresholds for relative yield (CA / CONV). This provides a more interpretable scale of performance: -Very poor: < 0.90 -Poor: 0.90–0.95 -Acceptable: 0.95–1.00 -Good: ≥ 1.00 We add this classification to the block_yield dataset for each season and block. Using these categories, we generate alluvial diagrams to visualize how farms transition between performance groups across seasons.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

Typology of multi-season CA performance (23A–25B)

To describe how farms perform under CA over time, we created a typology of block-level responses using their relative yield (CA / CONV) across six seasons (23A–25B). Only blocks with data in at least four seasons were retained to ensure stable classification.

For each block, we calculated its average relative yield, identified whether it ever experienced a yield penalty greater than 5%, and compiled its full sequence of seasonal performances. Based on these indicators, each farm was grouped into one of four long-term performance types: -Good: Average ≥ 0.95 and no season < 0.95 -Acceptable: Average ≥ 0.95 but at least one season < 0.95 -Poor: Average between 0.90 and 0.95 -Very Poor: Average ≤ 0.90

These thresholds were defined arbitrarily but intentionally, informed by field experience and our understanding of what yield reductions farmers may find acceptable if longer-term soil-health and economic benefits of CA are demonstrated and communicated clearly. Importantly, this typology is not final—it will be refined and validated through farmer consultations to ensure the categories and cut-off points align with farmers’ own perceptions of acceptable and unacceptable performance.

Notes:once again we see that there is around 18% of plots that could have an acceptable yield performance of the CA. This is of cource not considering all the soild benefits of this plots which are highly afftected by water erosion under conventional tillage, specially in maize-beans rotations. An important consideration is that this indictes that about 18 % of the plots coudl potentially be targeted with CA (if we just consider yield), however, farmers in Rwanda own on average 2.5 plots per hosehold (NISR 2023) so that would be about 39% of the farmers having at least one of their plots with acceptable performance of CA. Question remains…can we pre-identify this farmers and plots within the farm? Is it possible then to use that information to target the recommendatiosn to these farmers and plots within their farmes?… we will explore that below.

library(dplyr)
library(tidyr)

# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")

# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- block_yield %>%
  filter(IQR_Season %in% seasons, is.finite(Relative_Yield)) %>%
  group_by(IQR_block) %>%
  summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
  filter(n_valid_seasons >= 4)

# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- block_yield %>%
  filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
         IQR_Season %in% seasons,
         is.finite(Relative_Yield))

# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
  group_by(IQR_block, IQR_Season) %>%
  summarise(Relative_Yield = mean(Relative_Yield, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = IQR_Season,
    values_from = Relative_Yield,
    names_prefix = "Relative_Yield_"
  )

# Step 5: Compute performance typology
Perf_Type <- filtered_yield %>%
  group_by(IQR_block) %>%
  summarise(
    n_seasons = n_distinct(IQR_Season),
    any_below_0.95 = any(Relative_Yield < 0.95),
    mean_ratio = mean(Relative_Yield, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    CA_typology = case_when(
      !any_below_0.95 ~ "Good",
      any_below_0.95 & mean_ratio >= 0.95 ~ "Acceptable",
      any_below_0.95 & mean_ratio < 0.95 & mean_ratio > 0.90 ~ "Poor",
      mean_ratio <= 0.90 ~ "Very Poor"
    )
  )

# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- block_yield %>%
  filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
  arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>%  # Prioritize 23A > 24A > 25A
  distinct(IQR_block, .keep_all = TRUE) %>%
  dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)


# Step 7: Join all into Perf_Type
Perf_Type <- Perf_Type %>%
  left_join(RY_wide, by = "IQR_block") %>%
  left_join(block_meta, by = "IQR_block")

Now we analyse the frequency of each type

Same by environment:

Now we explore what are the drivers of performance in each season

With the selected variables we explore better one by one

ICR_N_perc, ICR_Org_C_perc, ICR_Avail_P_ppm, ICR_K_meq_100g, IQR_TF_tubura_client, ICR_age_hh_head, IQR_TF_role, ICR_HH_size, ICR_arable_land_owned, ICF_Lat, ICF_Long, ICF_Alt, IQR_plot_fert_quality, IQR_soil_texture, IQR_soil_color, IQF_Position_landscape, IQF_position_slope, IQF_Weed_specie, IQR_compost_quality, ICR_compost_tn_ha, IQR_types_compost, IQR_basic_crop, ICF_planting_date, ICR_avg_daily_rainfall

Nitrogen pre-planting - ICR_N_perc -

Colelcted only season 23A. Notes: Keep. has low N of missing values, high CV.

## # A tibble: 1 × 6
##   Total Missing Available  Mean    SD    CV
##   <int>   <int>     <int> <dbl> <dbl> <dbl>
## 1   556       4       552 0.221 0.137 0.621

Organic C - ICR_Org_C_perc -

Notes: Keep. for this one we have 3 seasons. Since they are all very complete, I will use the average of 23A and 24A measurements for the control treatment (Conventinal, A) I think this will give better estimate of the levels of OM of a block and in 2 years only very little change of OM in 0-30cm depth should be expected.

## # A tibble: 3 × 7
##   IQR_Season Total Missing Available  Mean    SD    CV
##   <chr>      <int>   <int>     <int> <dbl> <dbl> <dbl>
## 1 23A          556       4       552  2.10 0.878 0.419
## 2 24A          467      29       438  2.46 1.37  0.560
## 3 25A          444     160       284  3.19 1.54  0.485

K - ICR_K_meq_100g -

Notes: Keep. Colelcted only season 23A. Keep ir, has low N of missing values, high CV.

## # A tibble: 3 × 7
##   IQR_Season Total Missing Available    Mean     SD     CV
##   <chr>      <int>   <int>     <int>   <dbl>  <dbl>  <dbl>
## 1 23A          556       4       552   0.510  0.494  0.968
## 2 24A          467     467         0 NaN     NA     NA    
## 3 25A          444     444         0 NaN     NA     NA

Available P. - ICR_Avail_P_ppm -

Notes: Keep. We will use only TreatA as tillage may make more uniform the samplig within the plot. We will use average of the 3 samplings ti increase quality. Now sure whay so much difference among seasons of sampling. One would expect increase if any (becasue of DAP and compost applications)

## # A tibble: 3 × 9
##   IQR_Season Total Missing Available  Mean    SD    CV   Min   Max
##   <chr>      <int>   <int>     <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23A          556       4       552  21.2  33.0  1.56  1.9   310 
## 2 24A          467      29       438  27.7  35.6  1.29  1.12  193.
## 3 25A          444     160       284  21.6  29.3  1.35  1.52  164.

HH Size - ICR_HH_size -

Notes: NOT Keep, not sure it is really a factor to consider, we just stay with biophisical+management only (or non-biophisical that we are sure relate to biophisicals + management we can not measure)

## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `is.finite(as.numeric(ICR_HH_size))`.
## Caused by warning:
## ! NAs introduced by coercion

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `ICR_HH_size = as.numeric(ICR_HH_size)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 3 × 10
##   IQR_Season Total Missing Available  Mean    SD    CV Non_NA_Non_Zero   Min
##   <chr>      <int>   <int>     <int> <dbl> <dbl> <dbl>           <int> <dbl>
## 1 23A          556       3       553  5.58  2.09 0.375             553     1
## 2 24A          467       0       467  5.69  2.00 0.352             467     1
## 3 25A          444       2       442  5.49  2.00 0.364             442     1
## # ℹ 1 more variable: Max <dbl>

Arable land Owned -ICR_arable_land_owned-

Comclusion: Keep. While not biophisical, may be related to them and to historical management, other crops, mulch pressure, etc. Something wrong with 23B. use average of all the rest

## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `is.finite(as.numeric(ICR_arable_land_owned))`.
## Caused by warning:
## ! NAs introduced by coercion

### ALtitude -ICF_Alt- Notes: Keep. Altitude was recorded each season. What is good as we can do average of recordingas and increase quality. We had a good CV, however, scores for 25A are a little off so for the averages we will exclude that season ### Rain (average daily) - ICR_avg_daily_rainfall - Notes. Keep. We will use the average of the 6 seasons but later we can explore better rain accumulated at different periods (ej.g., if we assume mucl can help early dorughts etc)

## # A tibble: 3 × 7
##   IQR_Season Total Missing Available  Mean    SD    CV
##   <chr>      <int>   <int>     <int> <dbl> <dbl> <dbl>
## 1 23A          556       0       556  5.04  1.67 0.332
## 2 24A          467      11       456  5.20  1.95 0.375
## 3 25A          444      12       432  4.15  1.49 0.359

Compost tons per ha - ICR_compost_tn_ha -

Notes: Keep. We can use average of the 6 season for the farmer as a management factor (biophisical + management). We also use quality and source of compost as corector factors for amount (see below)

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `ICR_compost_tn_ha = as.numeric(ICR_compost_tn_ha)`.
## Caused by warning:
## ! NAs introduced by coercion

Now we explore the qualitative variables

Tubura client - IQR_TF_tubura_client -

Notes: Keep. Not biophisical or management but related to them. Not perfectly balanced (ranging form 58 to 75 % of tubura clients) but goo denough as to beinclued considering how relevant it can be. A binary variable is often considered too imbalanced if one class is >90–95%, especially in ML models, so are well below that threshold.

Plot fertility as perceibed by the farmer - IQR_plot_fert_quality -

Notes: Keep with caution, “low” is very low represented (may consider to combine with medium), and Medium is very over-represented ### Now for soil texture - IQR_soil_texture - Notes: Keep. We have a nice representativeness of different soils and quite balanced althouth the 3-Clay may be better to combine in 1 as they have few reps in each cathegory. For the ones that are NA in season 23A, get the data from 24A of 24B ### Soil color -IQR_soil_color- Notes: Keep. We have a nice representativeness of different soils and quite balanced althouth better combine red less with red more. Also see if can combine the soil in KY with other. For the ones that are NA in season 23A, get the data from 24A of 24B ### Position in the lanscape - IQF_Position_landscape - Notes: Keep but extremely unbalanced so with caution. The category hills completely dominates (~90%), the other two categories are rare, each <5%. In mix models the low N cathegories will have high SE, nit very informative. RF handles imbalanced categorical predictors fine, but will provably have low relevance (even if it may have it). SO maybe include it in a first ran of mix and RF see the results and base on that remove. They can look indivudually at how the scores of relative yield are in each environment for each category (maybe looking for the blocks that are most similar for other factors Paired comparison?) ### Position on the slope - IQF_position_slope - Notes: Keep but try to find/impute the 10% missing data (not recorede in other seasons, but maybe can be imputed from other variables like position in the landscape or if we measure slope later). As is can be used for mixed models although limited as quite umbalanced and several cathegoreis low represented. For RF is of as is

## 🔎 Total NA blocks in 23A: 0
## ✅ Recovered from 24A: 0

Weed species IQR_Weedtype1_specie - IQR_Weedtype1_specie -

Notess: Keep. But Generally not suitable for mix models and even for RF not useful in its current form. Having too many rare categories (singletons) would cause unstable model estimation, produce large standard errors and potentially lead to singular fits or convergence issues. We can group the ones in minority by weed type (broad leaf and grasses). We Keep individually becuase they have large number: Couch_grass
Galinsoga_parviflora And weechweed because of its relevance (striga). It has N=19, what is not so bad. Correct urukuta = Commelina_benghalensis

### IQR_compost_quality - IQR_compost_quality - For compose we need to get average of the seasons, so we need to conver the qualitative scale to numeric and then do average. “poor” → 1 “average” → 2 “well_decomposed” → 3. While there is almost half of farmers that are very onsistent on having good, average of poor compost along the seaosns,another amount moves from one to another. So we better do average of the score along seasons and then use the numerical scale rather than the factorial Not sure how to use compost quality tough…it seems that it may no be an indicator strong enough on its own but we can also think about using it as a correction factor for compost amount; e.g. average compost leaves amount =, food it increases by a % and poor decreses it by another %??? *Note: interesting to seee how there seems to be a trent of better quality in zones with higuer temperatures…what could be expected by faster decomposition Here an analysis hut to check how cuality changes across environments and that areas with lower T have lower quality (less decomposition rate???)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: compost_score ~ IQF_environment + (1 | IQR_block)
##    Data: block_yield
## 
## REML criterion at convergence: 3968.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7775 -0.4483  0.1083  0.4283  3.5108 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  IQR_block (Intercept) 0.1161   0.3407  
##  Residual              0.1737   0.4168  
## Number of obs: 2884, groups:  IQR_block, 569
## 
## Fixed effects:
##                              Estimate Std. Error         df t value Pr(>|t|)
## (Intercept)                   2.35154    0.02978  810.11228  78.969  < 2e-16
## IQF_environmentMid_Low_Dry    0.42231    0.03615 1353.05329  11.682  < 2e-16
## IQF_environmentMid_Low_Wet    0.11896    0.04020 1009.85997   2.959  0.00316
## IQF_environmentTransition    -0.05641    0.04560  990.85563  -1.237  0.21636
##                               
## (Intercept)                ***
## IQF_environmentMid_Low_Dry ***
## IQF_environmentMid_Low_Wet ** 
## IQF_environmentTransition     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) IQF_M_L_D IQF_M_L_W
## IQF_nvM_L_D -0.711                    
## IQF_nvM_L_W -0.719  0.580             
## IQF_nvrnmnT -0.633  0.466     0.563
## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons
##  IQF_environment emmean     SE   df lower.CL upper.CL .group
##  Transition        2.30 0.0353 1025     2.21     2.38  1    
##  Highland          2.35 0.0298  845     2.28     2.43  1    
##  Mid_Low_Wet       2.47 0.0280 1000     2.40     2.54   2   
##  Mid_Low_Dry       2.77 0.0257  942     2.71     2.84    3  
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: sidak method for 4 estimates 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05 
## NOTE: If two or more means share the same grouping symbol,
##       then we cannot show them to be different.
##       But we also did not show them to be the same.

Now we explore if there is a trend on farmers that usually used good /poor compost or change a lot from season to season

## Warning in setup_data(...): Some differentiation aesthetics vary within alluvia, and will be diffused by their first value.
## Consider using `geom_flow()` instead.

## Warning in setup_data(...): Some differentiation aesthetics vary within alluvia, and will be diffused by their first value.
## Consider using `geom_flow()` instead.

## Warning in setup_data(...): Some differentiation aesthetics vary within alluvia, and will be diffused by their first value.
## Consider using `geom_flow()` instead.

### Compost types - IQR_types_compost - Notes: Do not keep, most on the same, too unbalanced. In any case can then evluate individually ### Basic crop Notes: Not to keep. Althous we can explore how buch and beans behave udner CA. Limitations is that there will be lots of confunding factors associeted to the specie, but maybe we can find some that are similar context with differnet crop?? ### Planting date Notes. Keep. We have a nice range within each district-season, what makes it very meaningful the planting dates comparison. Another relevant management practice. However, since planting date is relative, to others is same context, we will make it relative to the 1st planting date in the same district the same season. So we ID the first planting date and give a 1 and from there we add delas from that.

First we make sure we align formationgs of dates. Then we create a verable that is a date that is more meaningful; days from the time the first block was planted inthe same district-season combination.

## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Figure

So the overall Notess of variables we keep fro drivers analysis: N – ICR_N_perc. Collected only in 23A; low missing values but high CV.Keep as is; single-season snapshot. Organic C – ICR_Org_C_perc. Available for 3 seasons; very complete. Use the average of 23A and 24A for conventional plots to better represent block OM (minimal expected OM change over 2 years). K – ICR_K_meq_100g. Collected only in 23A; low missing values, high CV. Keep as is. Available P – ICR_Avail_P_ppm. Use only Treatment A (to avoid tillage sampling effects). Average the 3 sampling seasons to improve quality; unclear why values differ so much. Arable Land Owned – ICR_arable_land_owned. Socio-management variable; may relate to soil, crops, mulch, history. Exclude 23B (erroneous) and use average of remaining data. Altitude – ICF_Alt. Recorded each season; good CV. Average across seasons but exclude 25A (values off). Rainfall (avg daily) – ICR_avg_daily_rainfall. Use average across 6 seasons for now. Later explore rainfall accumulation during key crop periods. Compost (t/ha) – ICR_compost_tn_ha. Use 6-season average as a management factor. Include compost source/quality as correction factors. Tubura Client – IQR_TF_tubura_client. Binary, moderately imbalanced (58–75% “yes”); acceptable for models. Relevant socio-management factor; include. Perceived Plot Fertility – IQR_plot_fert_quality. “Low” barely represented; consider merging with “Medium.Medium over-represented, so use with caution. Soil Texture – IQR_soil_texture. Generally well-represented and balanced. Combine rare clay categories; fill 23A missing values using 24A/24B. Soil Color – IQR_soil_color. Good distribution; may merge “red less” with “red more.” Check whether KY combines with another group; fill missing as above. Position in the Landscape – IQF_Position_landscape. Extremely unbalanced (~90% hills). Include initially in models, but likely low information; inspect performance before final inclusion. Position on the Slope – IQF_position_slope. 10% missing; attempt imputation from related variables. Usable in mixed models but unbalanced; RF can handle as is. Weed Species – IQR_Weedtype1_specie. Too many rare categories for modeling; group minority species into grass/broadleaf. Keep key species separately: couch grass, Galinsoga parviflora, and witchweed (Striga, N=19). Compost Quality – IQR_compost_quality. Convert qualitative scale to numeric (1–3) and average across seasons. Potentially useful as correction factor for compost amount; quality trends relate to warmer zones.

Now we prepare the database that will have the selected drivers properly calculated

Calculations will be one per season-block combination (“seasonal”) and then one that will be calculated for the block along seasons and used when we evaluate divers of avege along seasons or the “dynamic performance” (“combined”)

We strat by creating a database called “yield_drivers” that we will then join to block_yield

Driver 1: ICR_N_perc We now extract the data from 23A (only collected that season) and add it to drivers

check

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0500  0.1500  0.1800  0.2213  0.2500  1.0800      18

Driver 2: ICR_Org_C_perc
for this one we have 3 seasons. Since they are all very complete, I will use the average of 23A and 24A measurements for the control treatment (Conventinal, A) I think this will give better estimate of the levels of OM of a block and in 2 years only very little change of OM in 0-30cm depth should be expected. So we will get it from the database yield_CA_comb, filtering for Treat_group = CONV

check

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.480   1.620   2.100   2.272   2.763   9.930       2

Driver 3: ICR_K_meq_100g We now extract the data from 23A (only collected that season) and add it to drivers

check

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0500  0.2000  0.3300  0.5101  0.6600  3.4100      18

Driver 4 ICR_Avail_P_ppm We will use only Treat A as tillage may make more uniform the samplig within the plot. We will use average of the 3 samplings ti increase quality. Now sure whay so much difference among seasons of sampling. One would expect increase if any (becasue of DAP and compost applications)

check

## Warning: Unknown or uninitialised column: `ICR_Avail_P_ppm`.
## Length  Class   Mode 
##      0   NULL   NULL

Driver 5 ICR_arable_land_owned
While not biophisical, may be related to them and to historical management, other crops, mulch pressure, etc. Something wrong with 23B. use average of all the rest

check

## Warning: Unknown or uninitialised column: `ICR_arable_land_owned`.
## Length  Class   Mode 
##      0   NULL   NULL

Driver 6 ICF_Alt Altitude was recorded each season. What is good as we can do average of recordingas and increase quality. We had a good CV, however, scores for 25A are a little off so for the averages we will exclude that season. Also values <500 are not included in the avergaes

Check

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1001    1461    1660    1692    1877    2434       1

Driver 7 ICR_avg_daily_rainfall We will use the average of the 6 seasons but later we can explore better rain accumulated at different periods (ej.g., if we assume mucl can help early dorughts etc). Use only CONV (Treat_group = CONV). If rainfall is 0 or NA, replace using: same season type (A with A, B with B) from previous or next year if available Example: Replace missing 24A with value from 23A or 25A Replace missing 24B with value from 23B or 25B Compute block-level average rainfall after imputation.Join into yield_drivers.

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6817  3.3583  3.9783  4.1437  4.8850  9.5367

Driver 8 ICR_compost_tn_ha We use average of the 6 season for the farmer as a management factor (biophisical + management). We also can use quality and source of compost as corector factors for amount, but for now we usem individually

Check:

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   8.787  10.544  11.009  13.032  35.037

Driver 9 IQR_TF_tubura_client Not biophisical or management but related to them. Not perfectly balanced (ranging form 58 to 75 % of tubura clients) but good enough as to be inclued considering how relevant it can be. A binary variable is often considered too imbalanced if one class is >90–95%, especially in ML models, so are well below that threshold.

Check

## 
##  --- #N/A    0    1 
##    3    1  192  373

Driver 10 IQR_plot_fert_quality
WE will use with caution, “low” is very low represented (may consider to combine with medium). We use 23A data

cehck

## 
##         ---        high low_quality      medium        <NA> 
##           2          76          14         474           3

See if we can find the missing data in other seasons

## [1] "43_1" "44_3"
library(dplyr)

blocks_to_check <- c("43_1", "44_3")

Fert_check <- yield_CA_comb %>%
  filter(IQR_block %in% blocks_to_check) %>%
  dplyr::select(IQR_block, IQR_Season, Treat_group, IQR_plot_fert_quality) %>%
  arrange(IQR_block, IQR_Season, Treat_group)

Fert_check
## # A tibble: 16 × 4
##    IQR_block IQR_Season Treat_group IQR_plot_fert_quality
##    <chr>     <chr>      <chr>       <chr>                
##  1 43_1      23A        CA          ---                  
##  2 43_1      23A        CONV        ---                  
##  3 43_1      23B        CA          medium               
##  4 43_1      23B        CONV        medium               
##  5 44_3      23A        CA          ---                  
##  6 44_3      23A        CONV        ---                  
##  7 44_3      23B        CA          medium               
##  8 44_3      23B        CONV        medium               
##  9 44_3      24A        CA          medium               
## 10 44_3      24A        CONV        medium               
## 11 44_3      24B        CA          medium               
## 12 44_3      24B        CONV        medium               
## 13 44_3      25A        CA          medium               
## 14 44_3      25A        CONV        medium               
## 15 44_3      25B        CA          medium               
## 16 44_3      25B        CONV        medium

Replase the 2 — in yield_drivers

Driver 11 IQR_soil_texture We have a nice representativeness of different soils and quite balanced althouth the 3-Clay may be better to combine in 1 as they have few reps in each cathegory. For the ones that are NA in season 23A, get the data from 24A of 24B

Now we try to see if we can find values for the NA is other seasons

## # A tibble: 116 × 4
##    IQR_block IQR_Season Treat_group IQR_soil_texture   
##    <chr>     <chr>      <chr>       <chr>              
##  1 132_2     24B        CA          F_(sandy_clay_loam)
##  2 132_2     24B        CONV        F_(sandy_clay_loam)
##  3 132_2     25A        CA          F_(sandy_clay_loam)
##  4 132_2     25A        CONV        F_(sandy_clay_loam)
##  5 132_2     25B        CA          F_(sandy_clay_loam)
##  6 132_2     25B        CONV        F_(sandy_clay_loam)
##  7 132_3     24B        CA          F_(sandy_clay_loam)
##  8 132_3     24B        CONV        F_(sandy_clay_loam)
##  9 132_3     25A        CA          F_(sandy_clay_loam)
## 10 132_3     25A        CONV        F_(sandy_clay_loam)
## # ℹ 106 more rows
## # A tibble: 70 × 12
##    IQR_block ICR_N_perc_23A ICR_Org_C_avg ICR_K_perc_23A ICR_Avail_P_avg
##    <chr>              <dbl>         <dbl>          <dbl>           <dbl>
##  1 10_12               0.19          2.02           2.7            148  
##  2 10_9                0.13          1.78           0.88            56  
##  3 102_1               0.26          2.69           0.73             8.2
##  4 102_8               0.13          1.46           0.29            20.2
##  5 102_9               0.24          2.88           0.96             8.2
##  6 103_12              0.45          2.12           0.36             2.3
##  7 103_4               0.21          2.14           1.03             2.7
##  8 110_11              0.34          2.69           0.25             8.5
##  9 111_2               0.46          3.96           0.16             7  
## 10 111_5               0.33          3.32           0.22             9.7
## # ℹ 60 more rows
## # ℹ 7 more variables: ICR_arable_land_avg <dbl>, ICF_Alt_avg <dbl>,
## #   ICR_rainfall_avg <dbl>, ICR_compost_avg <dbl>, IQR_TF_tubura_client <chr>,
## #   IQR_plot_fert_quality <chr>, IQR_soil_texture <chr>
## # A tibble: 70 × 12
##    IQR_block ICR_N_perc_23A ICR_Org_C_avg ICR_K_perc_23A ICR_Avail_P_avg
##    <chr>              <dbl>         <dbl>          <dbl>           <dbl>
##  1 10_12               0.19          2.02           2.7            148  
##  2 10_9                0.13          1.78           0.88            56  
##  3 102_1               0.26          2.69           0.73             8.2
##  4 102_8               0.13          1.46           0.29            20.2
##  5 102_9               0.24          2.88           0.96             8.2
##  6 103_12              0.45          2.12           0.36             2.3
##  7 103_4               0.21          2.14           1.03             2.7
##  8 110_11              0.34          2.69           0.25             8.5
##  9 111_2               0.46          3.96           0.16             7  
## 10 111_5               0.33          3.32           0.22             9.7
## # ℹ 60 more rows
## # ℹ 7 more variables: ICR_arable_land_avg <dbl>, ICF_Alt_avg <dbl>,
## #   ICR_rainfall_avg <dbl>, ICR_compost_avg <dbl>, IQR_TF_tubura_client <chr>,
## #   IQR_plot_fert_quality <chr>, IQR_soil_texture <chr>

Driver 12 IQR_soil_color We have a nice representativeness of different soils and quite balanced althouth better combine red less with red more. For the ones that are NA in season 23A, get the data from 24A of 24B

## # A tibble: 89 × 13
##    IQR_block ICR_N_perc_23A ICR_Org_C_avg ICR_K_perc_23A ICR_Avail_P_avg
##    <chr>              <dbl>         <dbl>          <dbl>           <dbl>
##  1 10_12               0.19          2.02           2.7            148  
##  2 10_9                0.13          1.78           0.88            56  
##  3 102_1               0.26          2.69           0.73             8.2
##  4 102_8               0.13          1.46           0.29            20.2
##  5 102_9               0.24          2.88           0.96             8.2
##  6 103_12              0.45          2.12           0.36             2.3
##  7 103_4               0.21          2.14           1.03             2.7
##  8 110_11              0.34          2.69           0.25             8.5
##  9 111_2               0.46          3.96           0.16             7  
## 10 111_5               0.33          3.32           0.22             9.7
## # ℹ 79 more rows
## # ℹ 8 more variables: ICR_arable_land_avg <dbl>, ICF_Alt_avg <dbl>,
## #   ICR_rainfall_avg <dbl>, ICR_compost_avg <dbl>, IQR_TF_tubura_client <chr>,
## #   IQR_plot_fert_quality <chr>, IQR_soil_texture <chr>, IQR_soil_color <chr>

Driver 13 IQF_Position_landscape Extremely unbalanced so with caution. The category hills completely dominates (~90%), the other two categories are rare, each <5%. For now we leave it as si, but then we can combine them, or we just remove it since this will be somehow captured by position on the slope

## 
## FALSE  TRUE 
##   566     3
## [1] "hills" "hills" "hills" "hills" "hills" "hills"

now we complete missing data from 23 A CONV with other seasons and CA

## 
## FALSE 
##   569
## 
##              ---            hills Radical_terraces           Valley 
##                3              514               25               27

check

Driver 14 IQF_position_slope Find/impute the 10% missing data (not recorede in other seasons, but maybe can be imputed from other variables like position in the landscape or if we measure slope later). As is can be used for mixed models although limited as quite umbalanced and several cathegoreis low represented. For RF is of as is

check

Try to fill with data from other seasons/ and CA

## 
## Final slope distribution:
## 
##       --- Backslope      Flat Footslope  Shoulder    Summit 
##        56       304        43        76        54        36
## 
## Missing after fill:
## [1] 0

Try to imputate using data from position on the landscape; if valley or terraze = “Flat”

check Driver 15 IQR_Weedtype1_specie
Having too many rare categories (singletons) would cause unstable model estimation, produce large standard errors and potentially lead to singular fits or convergence issues. I grouped the ones in minority by weed type (broad leaf and grasses). We Keep individually becuase they have large number the Couch_grass and Galinsoga_parviflora, and weechweed because of its relevance (striga; with N=19, what is not so bad). I also corrected urukuta = Commelina_benghalensis

First for season A, getting from 23A

## 
## FALSE  TRUE 
##   554    15

Now we fill gaps for season A with 24A and for season B with 24B

Replace “urukuta” with “Commelina_benghalensis”

check Now we keep priginal specie ID for couch grass and Galinsoga and Stiga, but for the rest we clasiffy in perenial or annual AND grass or broadleaf

Check distribution again

Driver 16 IQR_compost_quality For compose we need to get average of the seasons, so we need to conver the qualitative scale to numeric and then do average. “poor” → 1 “average” → 2 “well_decomposed” → 3. Not sure how to use compost quality tough…it seems that it may no be an indicator strong enough on its own but we can also think about using it as a correction factor for compost amount; e.g. average compost leaves amount =, food it increases by a % and poor decreses it by another %??? For now we just keep it individual *Note: interesting to seee how there seems to be a trent of better quality in zones with higuer temperatures…what could be expected by faster decomposition

Check

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.000   2.250   2.750   2.597   3.000   3.000       1

We will create a compost quantyty corrected by compost quality to reduce the number of variables. This will be called Comp_amount_corr_quali in the database yield_drivers we will make a new variable called “Comp_amount_corr_quali” that will be calculated as yield_drivers:ICR_compost_avg corrected based on “yield_drivers:IQR_compost_quality_avg” as correction factor = (“yield_drivers:IQR_compost_quality_avg”/5 ) * 1.2

yield_drivers <- yield_drivers %>%
  mutate(
    Comp_amount_corr_quali =
      ICR_compost_avg * ((IQR_compost_quality_avg / 5) * 1.2)
  )

Driver 17 ICF_planting_date We have a nice range within each district-season, what makes it very meaningful the planting dates comparison. Another relevant management practice. However, since planting date is relative, to others is same context, we will make it relative to the 1st planting date in the same district the same season. So we ID the first planting date and give a 1 and from there we add delas from that.

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   8.333  12.667  13.749  17.667  45.000

#Now we have the 17 drivers at the “Block” level we can evaluate some correlations and ponetial eliminatinos/modifications First Merge yield drivers into block_yield database;

Now we check for edundant predictors; identify multicollinearity and try to understand relationships among numeric AND categorical drivers

Split drivers into numeric and categorical

Correlation matrix for NUMERIC predictors Notes: - Compost_avg, compost_quality_avg, rainfall_avg, and altitude form a strong multicollinearity cluster. - Rain and altitude are a well known correlation, and altitude is also highly correlated to the “Environmental” class. So we will not use altitude and we will keep rain (as is for these specific seasons and not the historical) and the environmental class. - Compost quality cound be used a a factor to correct compost cuantity. I will us N is also highly corretated to soil C so will just keep soil C for now - Position in the slope and position in the landscape are also higuly correlated. Posiion in the slope gives us most information since the bast mayority of the farmers are in the hills. SO we will only keep position in the slope

Categorical vs categorical: Cramer’s V matrix Notes Position in the slope and position in the landscape my by redundat Numeric vs categorical: ANOVA or Eta²: Eta (η) measures how strongly a numeric variable differs across the categories of a categorical variable, indicating how much of the numeric variance is explained by group differences. Notes: as expected altitude and OAF-AEZ (Environment) are very correlated, maybe we wliminate altitude

## Warning: package 'DescTools' was built under R version 4.3.3
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
## 
##     Recode
##                    Category                 Numeric         Eta2
## 1                 IQR_block          ICR_N_perc_23A 1.000000e+00
## 2                 IQR_block           ICR_Org_C_avg 1.000000e+00
## 3                 IQR_block          ICR_K_perc_23A 1.000000e+00
## 4                 IQR_block         ICR_Avail_P_avg 1.000000e+00
## 5                 IQR_block     ICR_arable_land_avg 1.000000e+00
## 6                 IQR_block             ICF_Alt_avg 1.000000e+00
## 7                 IQR_block        ICR_rainfall_avg 1.000000e+00
## 8                 IQR_block         ICR_compost_avg 1.000000e+00
## 9                 IQR_block IQR_compost_quality_avg 1.000000e+00
## 10                IQR_block  Comp_amount_corr_quali 1.000000e+00
## 11                IQR_block       ICF_planting_date 1.000000e+00
## 12                IQR_block      Relative_Yield_avg 1.000000e+00
## 13     IQR_TF_tubura_client          ICR_N_perc_23A 3.247173e-05
## 14     IQR_TF_tubura_client           ICR_Org_C_avg 4.626583e-07
## 15     IQR_TF_tubura_client          ICR_K_perc_23A 8.772320e-03
## 16     IQR_TF_tubura_client         ICR_Avail_P_avg 2.586411e-03
## 17     IQR_TF_tubura_client     ICR_arable_land_avg 6.518245e-02
## 18     IQR_TF_tubura_client             ICF_Alt_avg 4.163909e-03
## 19     IQR_TF_tubura_client        ICR_rainfall_avg 2.951319e-03
## 20     IQR_TF_tubura_client         ICR_compost_avg 4.932751e-04
## 21     IQR_TF_tubura_client IQR_compost_quality_avg 2.117516e-03
## 22     IQR_TF_tubura_client  Comp_amount_corr_quali 6.698669e-04
## 23     IQR_TF_tubura_client       ICF_planting_date 5.451103e-04
## 24     IQR_TF_tubura_client      Relative_Yield_avg 8.730015e-03
## 25    IQR_plot_fert_quality          ICR_N_perc_23A 7.872584e-03
## 26    IQR_plot_fert_quality           ICR_Org_C_avg 9.416250e-04
## 27    IQR_plot_fert_quality          ICR_K_perc_23A 1.764909e-02
## 28    IQR_plot_fert_quality         ICR_Avail_P_avg 2.222547e-03
## 29    IQR_plot_fert_quality     ICR_arable_land_avg 6.126301e-03
## 30    IQR_plot_fert_quality             ICF_Alt_avg 3.976777e-02
## 31    IQR_plot_fert_quality        ICR_rainfall_avg 2.018969e-03
## 32    IQR_plot_fert_quality         ICR_compost_avg 6.639248e-03
## 33    IQR_plot_fert_quality IQR_compost_quality_avg 1.199915e-02
## 34    IQR_plot_fert_quality  Comp_amount_corr_quali 5.146812e-03
## 35    IQR_plot_fert_quality       ICF_planting_date 5.601602e-03
## 36    IQR_plot_fert_quality      Relative_Yield_avg 5.023419e-03
## 37         IQR_soil_texture          ICR_N_perc_23A 1.222361e-01
## 38         IQR_soil_texture           ICR_Org_C_avg 1.125896e-01
## 39         IQR_soil_texture          ICR_K_perc_23A 7.141605e-02
## 40         IQR_soil_texture         ICR_Avail_P_avg 1.140891e-01
## 41         IQR_soil_texture     ICR_arable_land_avg 6.149066e-02
## 42         IQR_soil_texture             ICF_Alt_avg 7.462161e-02
## 43         IQR_soil_texture        ICR_rainfall_avg 1.988667e-01
## 44         IQR_soil_texture         ICR_compost_avg 1.123132e-01
## 45         IQR_soil_texture IQR_compost_quality_avg 1.393339e-01
## 46         IQR_soil_texture  Comp_amount_corr_quali 7.564741e-02
## 47         IQR_soil_texture       ICF_planting_date 1.062138e-01
## 48         IQR_soil_texture      Relative_Yield_avg 6.525426e-02
## 49           IQR_soil_color          ICR_N_perc_23A 4.795614e-02
## 50           IQR_soil_color           ICR_Org_C_avg 5.017081e-02
## 51           IQR_soil_color          ICR_K_perc_23A 5.358184e-02
## 52           IQR_soil_color         ICR_Avail_P_avg 4.798314e-02
## 53           IQR_soil_color     ICR_arable_land_avg 4.928712e-02
## 54           IQR_soil_color             ICF_Alt_avg 4.915914e-02
## 55           IQR_soil_color        ICR_rainfall_avg 5.758434e-02
## 56           IQR_soil_color         ICR_compost_avg 4.583406e-02
## 57           IQR_soil_color IQR_compost_quality_avg 5.980268e-02
## 58           IQR_soil_color  Comp_amount_corr_quali 2.326852e-02
## 59           IQR_soil_color       ICF_planting_date 5.411899e-02
## 60           IQR_soil_color      Relative_Yield_avg 1.128132e-02
## 61   IQF_Position_landscape          ICR_N_perc_23A 1.039340e-01
## 62   IQF_Position_landscape           ICR_Org_C_avg 7.477678e-02
## 63   IQF_Position_landscape          ICR_K_perc_23A 2.012431e-03
## 64   IQF_Position_landscape         ICR_Avail_P_avg 1.886601e-03
## 65   IQF_Position_landscape     ICR_arable_land_avg 8.570925e-04
## 66   IQF_Position_landscape             ICF_Alt_avg 1.220450e-02
## 67   IQF_Position_landscape        ICR_rainfall_avg 2.030170e-03
## 68   IQF_Position_landscape         ICR_compost_avg 1.965661e-03
## 69   IQF_Position_landscape IQR_compost_quality_avg 7.805974e-03
## 70   IQF_Position_landscape  Comp_amount_corr_quali 8.991847e-03
## 71   IQF_Position_landscape       ICF_planting_date 1.183048e-02
## 72   IQF_Position_landscape      Relative_Yield_avg 6.876179e-03
## 73       IQF_position_slope          ICR_N_perc_23A 5.971899e-02
## 74       IQF_position_slope           ICR_Org_C_avg 3.374605e-02
## 75       IQF_position_slope          ICR_K_perc_23A 4.105470e-02
## 76       IQF_position_slope         ICR_Avail_P_avg 6.969007e-03
## 77       IQF_position_slope     ICR_arable_land_avg 3.232044e-02
## 78       IQF_position_slope             ICF_Alt_avg 4.528037e-03
## 79       IQF_position_slope        ICR_rainfall_avg 2.145827e-02
## 80       IQF_position_slope         ICR_compost_avg 8.936901e-03
## 81       IQF_position_slope IQR_compost_quality_avg 3.055493e-02
## 82       IQF_position_slope  Comp_amount_corr_quali 1.621334e-02
## 83       IQF_position_slope       ICF_planting_date 1.896696e-02
## 84       IQF_position_slope      Relative_Yield_avg 4.246341e-03
## 85  Weed_species_A_combined          ICR_N_perc_23A 4.400381e-02
## 86  Weed_species_A_combined           ICR_Org_C_avg 3.009641e-02
## 87  Weed_species_A_combined          ICR_K_perc_23A 2.791154e-02
## 88  Weed_species_A_combined         ICR_Avail_P_avg 2.552407e-02
## 89  Weed_species_A_combined     ICR_arable_land_avg 5.092240e-02
## 90  Weed_species_A_combined             ICF_Alt_avg 6.075114e-02
## 91  Weed_species_A_combined        ICR_rainfall_avg 1.210518e-01
## 92  Weed_species_A_combined         ICR_compost_avg 7.929260e-02
## 93  Weed_species_A_combined IQR_compost_quality_avg 5.007714e-02
## 94  Weed_species_A_combined  Comp_amount_corr_quali 8.226801e-02
## 95  Weed_species_A_combined       ICF_planting_date 1.872962e-02
## 96  Weed_species_A_combined      Relative_Yield_avg 3.081797e-02
## 97              CA_typology          ICR_N_perc_23A 4.408642e-03
## 98              CA_typology           ICR_Org_C_avg 9.942649e-03
## 99              CA_typology          ICR_K_perc_23A 2.273806e-03
## 100             CA_typology         ICR_Avail_P_avg 9.462810e-03
## 101             CA_typology     ICR_arable_land_avg 5.609013e-03
## 102             CA_typology             ICF_Alt_avg 9.576356e-03
## 103             CA_typology        ICR_rainfall_avg 1.087797e-02
## 104             CA_typology         ICR_compost_avg 2.391797e-02
## 105             CA_typology IQR_compost_quality_avg 6.226495e-03
## 106             CA_typology  Comp_amount_corr_quali 3.517053e-02
## 107             CA_typology       ICF_planting_date 4.881061e-03
## 108             CA_typology      Relative_Yield_avg 5.826745e-01
## 109         IQF_environment          ICR_N_perc_23A 6.106932e-02
## 110         IQF_environment           ICR_Org_C_avg 4.249497e-02
## 111         IQF_environment          ICR_K_perc_23A 6.049796e-02
## 112         IQF_environment         ICR_Avail_P_avg 2.182301e-02
## 113         IQF_environment     ICR_arable_land_avg 5.931029e-02
## 114         IQF_environment             ICF_Alt_avg 6.713131e-01
## 115         IQF_environment        ICR_rainfall_avg 3.953755e-01
## 116         IQF_environment         ICR_compost_avg 1.789913e-01
## 117         IQF_environment IQR_compost_quality_avg 2.840683e-01
## 118         IQF_environment  Comp_amount_corr_quali 4.409255e-02
## 119         IQF_environment       ICF_planting_date 1.949575e-01
## 120         IQF_environment      Relative_Yield_avg 3.278407e-02
Eta² value Interpretation
0.00–0.01 Negligible effect
0.01–0.06 Small effect
0.06–0.14 Medium effect
> 0.14 Large effect

#Now we look at the drivers of performance dynamic Random forest for the output variable of average relative yield of CA vs CONV across the 6 seasons in each farm (Relative_yield_avg) with the list of potential drives as is

Prepare the dataset for Random Forest We explude N, altitude, position in the lanscape

Notes: Very high % Var explained: 95.7 makes us suspect something is wrong with the variable s(too many amd/or too many cathergores within each with few representations)

RF data

library(dplyr)
library(forcats)
library(randomForest)

# 1. Prepare data
rf_data_clean <- rf_data %>%
  mutate(
    across(where(is.character), as.factor),
    across(where(is.numeric), ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)),
    across(where(is.factor), ~ fct_explicit_na(.x, na_level = "Unknown"))
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.factor), ~fct_explicit_na(.x, na_level =
##   "Unknown"))`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
# 2. Fit Random Forest model
set.seed(123)
rf_model <- randomForest(
  Relative_Yield_avg ~ .,
  data = rf_data_clean,
  ntree = 1000,
  importance = TRUE
)

# 3. Show model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = Relative_Yield_avg ~ ., data = rf_data_clean,      ntree = 1000, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.001406036
##                     % Var explained: 95.82
# 4. Variable importance plot
varImpPlot(rf_model, type = 1)

#Dive into drivers of performance

Make regression trees with the key variables leaving 15% fo the blcoks from each environment out for independent validation

Evaluate the % of farmers to be targeted if using the regression tree and the % of farmers that were acceptable or good

Compare a blanquet approach as compared to the targeted one:

##N of famers reached ##roportion of reached farmers with success ##Total N of farmers reached that will success / ## Modeling dissemination of blanket vs. targeted Using data from dissemination rates (% of potential dissemination based on success of target farmer) model how the intervantion impact woudl evolve in one and another case (from Intercrop trial and or from here if available) Some kind of modeling with dissemination theory and how if a farmer gets positive feedback X chance of adoption if only negative Y and if possitive and negative Z….then after N years, targeting could lead to even higher adoption and reduced costs