#Libraries and Data upload

library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(DataExplorer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.3.3
ca_raw <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/raw/ca_raw.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 11921 Columns: 94
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (54): IQR_Season, Seas_block_treat, IQR_SeasAB, Sample code, IQF_agzone,...
## dbl (39): ICF_Lat, ICF_Long, ICF_Alt, ICR_OM_perc, ICR_landprep_pers_usd_ha,...
## num  (1): DC_cost_usd_ha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(dplyr)

ca_raw <- ca_raw %>%
  mutate(
    IQF_environment = case_when(
      IQF_environment %in% c("Mid Low Dry", "Mid_Low_Dry") ~ "Mid_Low_Dry",
      IQF_environment %in% c("Mid Low Wet", "Mid_Low_Wet") ~ "Mid_Low_Wet",
      TRUE ~ IQF_environment
    )
  )

Data pre-Cleaning

Some numeric columns are read as characters. We’ll define two cleaning functions: 1. convert_numeric_chars() - converts any character column to numeric if all values are numeric. 2. clean_numeric_columns() - replaces invalid strings (e.g. “na”, “NA”) and converts to numeric if most values are numbers. ## Numeric/factoruian check

convert_numeric_chars <- function(df) {
  df %>%
    mutate(across(where(is.character), ~ {
      if (all(grepl("^[-]?[0-9]*\\.?[0-9]+$", .x[!is.na(.x)]))) {
        as.numeric(.x)
      } else {
        .x
      }
    }))
}

# Apply it
ca_raw <- convert_numeric_chars(ca_raw)

# Function to convert character columns to numeric when appropriate
clean_numeric_columns <- function(df) {
  df <- df %>%
    mutate(across(where(is.character), function(x) {
      # Replace "NA", "na", "" with real NA
      x[x %in% c("NA", "na", "")] <- NA
      
      # Try converting to numeric
      x_num <- suppressWarnings(as.numeric(x))
      
      # If most non-NA values are numeric, keep it
      if (sum(!is.na(x_num)) > 0.9 * sum(!is.na(x))) {
        return(x_num)
      } else {
        return(x)
      }
    }))
  
  return(df)
}

# Apply it to ca_raw
ca_raw <- clean_numeric_columns(ca_raw)

# Check summary of converted classes
table(sapply(ca_raw, class))
## 
## character   numeric 
##        39        55

Explore how database is reduced if we only keep trat A and C

library(dplyr)
library(tidyr)

summary_farmers_raw <- ca_raw %>%
  filter(!is.na(IQR_Season), !is.na(IQR_block)) %>%
  group_by(IQR_Season) %>%
  summarise(
    # total farmers (no yield / treatment conditions)
    total_farmers = n_distinct(IQR_block),

    # farmers who have BOTH A and C with yield not NA and not 0
    farmers_with_A_and_C = n_distinct(IQR_block[
      Treat_code %in% c("A","C") &
        !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 &
        ave(Treat_code %in% c("A","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="A",
            IQR_block, FUN = any) &
        ave(Treat_code %in% c("A","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="C",
            IQR_block, FUN = any)
    ]),

    # farmers who have A AND (B OR C) with yield not NA and not 0
    farmers_with_A_and_BorC = n_distinct(IQR_block[
      Treat_code %in% c("A","B","C") &
        !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 &
        ave(Treat_code %in% c("A","B","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="A",
            IQR_block, FUN = any) &
        ave(Treat_code %in% c("A","B","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code %in% c("B","C"),
            IQR_block, FUN = any)
    ]),
    .groups = "drop"
  ) %>%
  mutate(
    farmers_with_A_and_C = replace_na(farmers_with_A_and_C, 0),
    farmers_with_A_and_BorC = replace_na(farmers_with_A_and_BorC, 0),
    pct_with_A_and_C = round(100 * farmers_with_A_and_C / total_farmers, 1),
    pct_with_A_and_BorC = round(100 * farmers_with_A_and_BorC / total_farmers, 1)
  )

summary_farmers_raw
## # A tibble: 6 × 6
##   IQR_Season total_farmers farmers_with_A_and_C farmers_with_A_and_BorC
##   <chr>              <int>                <int>                   <int>
## 1 23A                  580                  559                     559
## 2 23B                  579                  506                     507
## 3 24A                  495                  469                     471
## 4 24B                  477                  472                     472
## 5 25A                  457                  441                     442
## 6 25B                  449                  441                     442
## # ℹ 2 more variables: pct_with_A_and_C <dbl>, pct_with_A_and_BorC <dbl>

#Data cleaning ## Clean for only farm-season with A and C, and that they are not 0 or NA

library(dplyr)

ca_raw <- ca_raw %>%
  mutate(
    IQF_SeasonblockID = if_else(
      is.na(IQF_SeasonblockID),
      paste(IQR_block, IQR_Season, sep = "_"),
      IQF_SeasonblockID
    )
  )


ca_TATC <- ca_raw %>%
  # keep only A and C rows
  filter(Treat_code %in% c("A", "C")) %>%
  # group by the paired unit
  group_by(IQF_SeasonblockID) %>%
  # keep only blocks where BOTH A and C exist
  filter(all(c("A", "C") %in% Treat_code)) %>%
  # and BOTH have valid (non-NA, non-zero) yields
  filter(all(!is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0)) %>%
  ungroup()

ca_TATC 
## # A tibble: 5,776 × 94
##    IQR_Season Seas_block_treat IQR_SeasAB `Sample code`    IQF_agzone IQF_region
##    <chr>      <chr>            <chr>      <chr>            <chr>      <chr>     
##  1 23A        23A79_15C        A          23A_i_P2_79_15_T Mayaga-Bu… S         
##  2 23A        23A79_15A        A          23A_i_P2_79_15_T Mayaga-Bu… S         
##  3 23A        23A58_10A        A          23A_i_P2_58_10_T Central P… NW        
##  4 23A        23A58_10C        A          23A_i_P2_58_10_T Central P… NW        
##  5 23A        23A53_4C         A          23A_i_P2_53_4_T  Volcanic … NW        
##  6 23A        23A53_4A         A          23A_i_P2_53_4_T  Volcanic … NW        
##  7 23A        23A38_15A        A          23A_i_P2_38_15_T Buberuka … NE        
##  8 23A        23A38_15C        A          23A_i_P2_38_15_T Buberuka … NE        
##  9 23A        23A9_14A         A          23A_i_P2_9_14_T  Eastern R… E         
## 10 23A        23A9_14C         A          23A_i_P2_9_14_T  Eastern R… E         
## # ℹ 5,766 more rows
## # ℹ 88 more variables: IQF_District <chr>, IQF_environment <chr>,
## #   IQR_TF_tubura_client <dbl>, ICR_age_hh_head <dbl>, IQR_sex_HH_head <chr>,
## #   IQR_TF_role <chr>, ICR_HH_size <dbl>, ICR_arable_land_owned <dbl>,
## #   IQR_block <chr>, Farm_ID <chr>, IQR_source_income <chr>,
## #   IQR_TF_role_agrdecision <chr>, ICF_Lat <dbl>, ICF_Long <dbl>,
## #   ICF_Alt <dbl>, IQR_plot_fert_quality <chr>, IQF_SeasonblockID <chr>, …
ca_TATC  %>%
  count(IQF_SeasonblockID)
## # A tibble: 2,888 × 2
##    IQF_SeasonblockID     n
##    <chr>             <int>
##  1 102_10_23A            2
##  2 102_10_23B            2
##  3 102_10_24A            2
##  4 102_10_24B            2
##  5 102_10_25A            2
##  6 102_10_25B            2
##  7 102_11_23A            2
##  8 102_11_23B            2
##  9 102_11_24A            2
## 10 102_11_24B            2
## # ℹ 2,878 more rows

Oulier detection

Prepare data

library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(broom.mixed)
## Warning: package 'broom.mixed' was built under R version 4.3.3
library(dplyr)

model_data <- ca_TATC %>%
  filter(
    !is.na(DC_Yield_t_ha),
    DC_Yield_t_ha != 0,
    !is.na(Treat_code),
    !is.na(IQR_Season),
    !is.na(IQF_environment),
    !is.na(IQR_block)
  ) %>%
  mutate(
    DC_Yield_t_ha   = as.numeric(DC_Yield_t_ha),
    Treat_code      = factor(Treat_code),
    IQR_Season      = factor(IQR_Season),
    IQF_environment = factor(IQF_environment),
    IQR_block       = factor(IQR_block)
  )

###Linear mixed model for outlier detection

model_resid <- lmer(
  DC_Yield_t_ha ~ Treat_code * IQR_Season +
    (1 | IQR_Season) +
    (1 | IQF_environment) +
    (1 | IQR_block),
  data = model_data,
  REML = TRUE
)

Extract residuals

resid_data <- augment(model_resid, data = model_data) %>%
  mutate(
    std_resid = as.numeric(scale(.resid)),
    is_outlier = abs(std_resid) > 3
  )

COunt outliers

outlier_counts <- resid_data %>%
  filter(is_outlier) %>%
  group_by(IQR_Season, Treat_code) %>%
  summarise(
    n_outliers = n(),
    .groups = "drop"
  )

print(outlier_counts)
## # A tibble: 6 × 3
##   IQR_Season Treat_code n_outliers
##   <fct>      <fct>           <int>
## 1 23A        A                  13
## 2 23A        C                   4
## 3 24A        A                  17
## 4 24A        C                   7
## 5 25A        A                   8
## 6 25A        C                   7

###Inspect extreme observations

resid_data %>%
  filter(is_outlier) %>%
  select(
    IQR_Season,
    Treat_code,
    IQR_block,
    IQF_environment,
    DC_Yield_t_ha,
    std_resid
  ) %>%
  arrange(desc(abs(std_resid)))
## # A tibble: 56 × 6
##    IQR_Season Treat_code IQR_block IQF_environment DC_Yield_t_ha std_resid
##    <fct>      <fct>      <fct>     <fct>                   <dbl>     <dbl>
##  1 24A        A          53_4      Highland                 18.1      8.22
##  2 24A        A          53_3      Highland                 19.4      8.14
##  3 24A        A          53_5      Highland                 16.9      7.85
##  4 23A        A          17_6      Mid_Low_Dry              16.2      7.27
##  5 24A        C          53_5      Highland                 14.9      7.19
##  6 24A        C          53_4      Highland                 15.4      7.00
##  7 24A        C          53_3      Highland                 15.7      6.07
##  8 23A        A          17_1      Mid_Low_Dry              16.7      6.07
##  9 24A        A          49_2      Highland                 14.4      5.98
## 10 24A        A          53_1      Highland                 14.1      5.15
## # ℹ 46 more rows

Visualization of ouliers

library(dplyr)
library(ggplot2)

# Prepare base plotting data
plot_data <- ca_TATC %>%
  filter(
    !is.na(DC_Yield_t_ha),
    DC_Yield_t_ha != 0,
    !is.na(Treat_code),
    !is.na(IQR_Season),
    !is.na(IQF_agzone)
  ) %>%
  mutate(
    Treat_code = factor(Treat_code),
    IQR_Season = factor(IQR_Season),
    IQF_agzone = factor(IQF_agzone)
  )

# Join LMM outlier flag
plot_data_lmm <- plot_data %>%
  left_join(
    resid_data %>%
      select(
        IQR_Season,
        Treat_code,
        IQR_block,
        is_outlier_lmm = is_outlier
      ),
    by = c("IQR_Season", "Treat_code", "IQR_block")
  ) %>%
  mutate(
    is_outlier_lmm = ifelse(is.na(is_outlier_lmm), FALSE, is_outlier_lmm)
  )
ggplot(plot_data_lmm, aes(x = Treat_code, y = DC_Yield_t_ha)) +
  geom_boxplot(outlier.shape = NA, fill = "gray90") +
  geom_jitter(
    aes(color = is_outlier_lmm),
    width = 0.2,
    alpha = 0.7
  ) +
  scale_color_manual(
    values = c("FALSE" = "black", "TRUE" = "red"),
    labels = c("FALSE" = "Regular observation", "TRUE" = "Model outlier")
  ) +
  facet_grid(IQF_agzone ~ IQR_Season, scales = "free_y") +
  labs(
    title = "Yield (t/ha) by Treatment and Season",
    subtitle = "Red points indicate outliers detected by linear mixed-effects model (|std. resid| > 4)",
    x = "Treatment Code",
    y = "Yield (t/ha)",
    color = ""
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    strip.text = element_text(size = 9)
  )

Found some plots with yield <0.1 tn/ha explore

library(dplyr)

threshold <- 0.1

block_season_flags <- ca_TATC %>%
  filter(!is.na(IQF_SeasonblockID), !is.na(Treat_code)) %>%
  mutate(DC_Yield_t_ha = as.numeric(DC_Yield_t_ha)) %>%
  group_by(IQF_SeasonblockID, IQR_Season) %>%
  summarise(
    # both A and C exist AND both are < 0.1
    AC_both_lt0.1 =
      any(Treat_code == "A" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold) &
      any(Treat_code == "C" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold),

    # A OR B is < 0.1 (either one, if present)
    AorB_lt0.1 =
      any(Treat_code == "A" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold) |
      any(Treat_code == "C" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold),

    .groups = "drop"
  )

# ---- Overall counts (unique block-season combinations) ----
overall_counts <- block_season_flags %>%
  summarise(
    total_block_season = n(),
    n_AC_lt0.1 = sum(AC_both_lt0.1),
    n_AorB_lt0.1 = sum(AorB_lt0.1),
    prop_AC_lt0.1 = round(mean(AC_both_lt0.1), 3),
    prop_AorB_lt0.1 = round(mean(AorB_lt0.1), 3)
  )

overall_counts
## # A tibble: 1 × 5
##   total_block_season n_AC_lt0.1 n_AorB_lt0.1 prop_AC_lt0.1 prop_AorB_lt0.1
##                <int>      <int>        <int>         <dbl>           <dbl>
## 1               2888          1           14             0           0.005
# ---- By season (optional) ----
by_season_counts <- block_season_flags %>%
  group_by(IQR_Season) %>%
  summarise(
    total_block_season = n(),
    n_AC_lt0.1 = sum(AC_both_lt0.1),
    n_AorB_lt0.1 = sum(AorB_lt0.1),
    prop_AC_lt0.1 = round(mean(AC_both_lt0.1), 3),
    prop_AorB_lt0.1 = round(mean(AorB_lt0.1), 3),
    .groups = "drop"
  )

by_season_counts
## # A tibble: 6 × 6
##   IQR_Season total_block_season n_AC_lt0.1 n_AorB_lt0.1 prop_AC_lt0.1
##   <chr>                   <int>      <int>        <int>         <dbl>
## 1 23A                       559          0            0         0    
## 2 23B                       506          1            7         0.002
## 3 24A                       469          0            1         0    
## 4 24B                       472          0            6         0    
## 5 25A                       441          0            0         0    
## 6 25B                       441          0            0         0    
## # ℹ 1 more variable: prop_AorB_lt0.1 <dbl>

Final clan data = ca_AC_clean

clean databse with no NA or <0.1 for yield and cleaned removing outliers (bothe treatments for theat season-farmer it at lease one is outlier)

library(dplyr)

# --- 1) Identify model outliers (if you already have resid_data, skip to step 2) ---
# NOTE: requires lme4, lmerTest, broom.mixed loaded and model fitted as you did

# resid_data should contain: IQR_Season, Treat_code, IQR_block, IQF_environment, .resid, is_outlier
# (from: resid_data <- augment(model_resid, data = model_data) %>% mutate(std_resid=..., is_outlier=...))

# --- 2) Drop ALL outlier blocks (balanced A/C by SeasonblockID) ---
# Rule: if either A or C is flagged as outlier within a given IQF_SeasonblockID,
#       remove the entire IQF_SeasonblockID (both treatments), so pairs remain balanced.

outlier_ids <- resid_data %>%
  filter(is_outlier) %>%
  distinct(IQF_SeasonblockID)
## not outliers
ca_AC_clean <- ca_TATC %>%
  # keep only A and C (optional, but usually desired for paired A/C analysis)
  filter(Treat_code %in% c("A", "C")) %>%
  # keep only complete A/C pairs with valid yields (recommended)
  group_by(IQF_SeasonblockID) %>%
  filter(all(c("A","C") %in% Treat_code)) %>%
  filter(all(!is.na(DC_Yield_t_ha) & DC_Yield_t_ha >= 0.1)) %>%
  ungroup() %>%
  # drop any SeasonblockID that had an outlier in either treatment
  anti_join(outlier_ids, by = "IQF_SeasonblockID")

# Quick checks
cat("Rows before:", nrow(ca_TATC), "\n")
## Rows before: 5776
cat("Rows after :", nrow(ca_AC_clean), "\n")
## Rows after : 5654
# Confirm balance: should be 2 rows (A and C) per SeasonblockID
balance_check <- ca_AC_clean %>%
  count(IQF_SeasonblockID) %>%
  count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
print(balance_check)
## # A tibble: 1 × 2
##       n    nn
##   <int> <int>
## 1     2  2827

Create variables of crop and year and block_season combination

library(tidyverse)

ca_AC_clean <- ca_AC_clean %>%
  mutate(
    year = str_sub(IQR_Season, 1, 2),
    crop_code = str_sub(IQR_Season, -1),
    crop = case_when(
      crop_code == "A" ~ "Maize",
      crop_code == "B" ~ "Beans",
      TRUE ~ NA_character_
    )
  )

ca_AC_clean$block_season = paste0(ca_AC_clean$IQR_block, "_", ca_AC_clean$IQR_Season)

#New database of column of yield ratio CONV_CA_Ratio

library(dplyr)

CONV_CA_Ratio <- ca_AC_clean %>%
  # Convert profit column
  mutate(
    DC_profit_usd_ha = as.numeric(gsub(",", "", trimws(DC_profit_usd_ha)))
  ) %>%
  
  # Keep only relevant treatments
  filter(Treat_code %in% c("A", "C")) %>%
  filter(!is.na(DC_Yield_t_ha), !is.na(DC_profit_usd_ha)) %>%
  
  # Group by block-season
  group_by(IQR_block, IQR_Season) %>%
  
  # NEW: keep only groups with both A and C after filtering
  filter(n_distinct(Treat_code) == 2) %>%
  
  # Mutate safely
  mutate(
    yield_A = DC_Yield_t_ha[Treat_code == "A"],
    yield_C = DC_Yield_t_ha[Treat_code == "C"],
    profit_A = DC_profit_usd_ha[Treat_code == "A"],
    profit_C = DC_profit_usd_ha[Treat_code == "C"],
    Y_ratio = yield_C / yield_A,
    Profit_ratio = profit_C / profit_A
  ) %>%
  
  # Keep only one row per group (e.g., the C row)
  filter(Treat_code == "C") %>%
  ungroup() %>%
  filter(is.finite(Y_ratio), is.finite(Profit_ratio))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `DC_profit_usd_ha = as.numeric(gsub(",", "",
##   trimws(DC_profit_usd_ha)))`.
## Caused by warning:
## ! NAs introduced by coercion

Yield and profit comparions exploration

% COnv>CA per soason-environment_Yield

library(dplyr)
library(tidyr)

A_vs_C_prop <- ca_AC_clean %>%
  filter(Treat_code %in% c("A", "C")) %>%
  select(
    IQR_Season,
    IQF_environment,
    IQF_SeasonblockID,
    Treat_code,
    DC_Yield_t_ha
  ) %>%
  pivot_wider(
    names_from = Treat_code,
    values_from = DC_Yield_t_ha
  ) %>%
  filter(!is.na(A), !is.na(C)) %>%
  group_by(IQR_Season, IQF_environment) %>%
  summarise(
    n_blocks = n(),
    prop_A_greater_C = mean(A > C),
    prop_C_greater_A = mean(C > A),
    prop_equal = mean(A == C),
    .groups = "drop"
  )

A_vs_C_prop
## # A tibble: 24 × 6
##    IQR_Season IQF_environment n_blocks prop_A_greater_C prop_C_greater_A
##    <chr>      <chr>              <int>            <dbl>            <dbl>
##  1 23A        Highland             120            0.692           0.308 
##  2 23A        Mid_Low_Dry          173            0.728           0.266 
##  3 23A        Mid_Low_Wet          145            0.745           0.255 
##  4 23A        Transition           100            0.68            0.31  
##  5 23B        Highland             111            0.694           0.270 
##  6 23B        Mid_Low_Dry          160            0.906           0.0875
##  7 23B        Mid_Low_Wet          132            0.773           0.212 
##  8 23B        Transition            90            0.756           0.233 
##  9 24A        Highland             104            0.971           0.0288
## 10 24A        Mid_Low_Dry          146            0.753           0.233 
## # ℹ 14 more rows
## # ℹ 1 more variable: prop_equal <dbl>

##Visual distribution of CONV advantage vs CA_Yield

library(ggplot2)

ggplot(CONV_CA_Ratio, aes(x = Y_ratio, fill = IQF_environment)) +
  geom_density(alpha = 0.3) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  labs(title = "Density of Relative Yield (CA / CONV) by Agzone")

##Visual distribution of CONV advantage vs CA_profit

ggplot(CONV_CA_Ratio, aes(x = Profit_ratio, fill = IQF_environment)) +
  geom_density(alpha = 0.5, adjust = 1.2) +  # <-- smoother
   geom_vline(xintercept = 1, linetype = "dashed") +
  scale_fill_brewer(palette = "Pastel1") +
  coord_cartesian(xlim = c(-1, 6)) +
  labs(
    title = "Density of Relative Profit (CA / CONV) by Agzone",
    x = "Profit_ratio",
    y = "Density"
  ) +
  theme_minimal()

## Another way to see the disctribution of the advantage of CONV_Yield Overall about 22% of the block-season combinations had CA as a similar or better performed than CONV

library(dplyr)
library(ggplot2)

# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
  arrange(Y_ratio) %>%
  filter(is.finite(Y_ratio)) %>%
  mutate(
    season_crop = case_when(
      IQR_Season %in% c("23A", "24A", "25A") ~ "Maize season",
      IQR_Season %in% c("23B", "24B", "25B") ~ "Beans season",
      TRUE ~ "Other"
    ),
    cum_percent = 100 * (row_number() / n())
  )

# 2. Estimate intersection y-value at x = 1 (Y_ratio = 1) for each IQF_environment
intersection_lines <- plot_data %>%
  group_by(IQF_environment) %>%
  filter(n() >= 5) %>%
  do({
    tryCatch({
      model <- loess(cum_percent ~ Y_ratio, data = .)
      y_at_1 <- predict(model, newdata = data.frame(Y_ratio = 1))
      data.frame(IQF_environment = unique(.$IQF_environment), y_cross = y_at_1)
    }, error = function(e) {
      data.frame(IQF_environment = unique(.$IQF_environment), y_cross = NA_real_)
    })
  }) %>%
  ungroup()

# 3. Merge y_cross into plot_data
plot_data <- left_join(plot_data, intersection_lines, by = "IQF_environment")

# 4. Plot
ggplot(plot_data, aes(x = Y_ratio, y = cum_percent)) +
  geom_point(aes(color = season_crop), size = 2) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
  geom_hline(aes(yintercept = y_cross), color = "blue", linetype = "dotted") +
  geom_text(
    data = intersection_lines,
    aes(x = 1.01, y = y_cross, label = paste0(round(y_cross, 1), "%")),
    inherit.aes = FALSE,
    color = "blue",
    size = 3,
    hjust = 0
  ) +
  scale_color_manual(
    values = c("Maize season" = "gold", "Beans season" = "saddlebrown", "Other" = "gray70")
  ) +
  scale_x_continuous(limits = c(0, 2.5)) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  facet_wrap(~ IQF_environment, scales = "free_y") +
  labs(
    x = "Relative Yield (CA / CONV)",
    y = "Cumulative % of Observations",
    title = "Cumulative Frequency of Relative Yield at Block Level",
    subtitle = "Dashed = 1 yield threshold; Dotted = intersect with cumulative curve",
    color = "Season Crop Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 10, hjust = 0.5),
    strip.text = element_text(face = "bold"),
    legend.position = "right"
  )
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).

##Same distribution but for profit IN terms of profit, the advantage of CONV is a little larger

library(dplyr)
library(ggplot2)

# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
  arrange(Profit_ratio) %>%
  filter(is.finite(Profit_ratio)) %>%
  mutate(
    season_crop = case_when(
      IQR_Season %in% c("23A", "24A", "25A") ~ "Maize season",
      IQR_Season %in% c("23B", "24B", "25B") ~ "Beans season",
      TRUE ~ "Other"
    ),
    cum_percent = 100 * (row_number() / n())
  )

# 2. Estimate intersection y-value at x = 1 (Profit_ratio = 1) for each IQF_environment
intersection_lines <- plot_data %>%
  group_by(IQF_environment) %>%
  filter(n() >= 5) %>%
  do({
    tryCatch({
      model <- loess(cum_percent ~ Profit_ratio, data = .)
      y_at_1 <- predict(model, newdata = data.frame(Profit_ratio = 1))
      data.frame(IQF_environment = unique(.$IQF_environment), y_cross = y_at_1)
    }, error = function(e) {
      data.frame(IQF_environment = unique(.$IQF_environment), y_cross = NA_real_)
    })
  }) %>%
  ungroup()

# 3. Merge y_cross into plot_data
plot_data <- left_join(plot_data, intersection_lines, by = "IQF_environment")

# 4. Plot
ggplot(plot_data, aes(x = Profit_ratio, y = cum_percent)) +
  geom_point(aes(color = season_crop), size = 2, alpha = 0.6) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
  geom_hline(aes(yintercept = y_cross), color = "blue", linetype = "dotted") +
  geom_text(
    data = intersection_lines,
    aes(x = 1.01, y = y_cross, label = paste0(round(y_cross, 1), "%")),
    inherit.aes = FALSE,
    color = "blue",
    size = 3,
    hjust = 0
  ) +
  scale_color_manual(
    values = c("Maize season" = "gold", "Beans season" = "saddlebrown", "Other" = "gray70")
  ) +
  scale_x_continuous(limits = c(-2, 2.5)) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  facet_wrap(~ IQF_environment, scales = "free_y") +
  labs(
    x = "Relative Profit (CA / CONV)",
    y = "Cumulative % of Observations",
    title = "Cumulative Frequency of Relative Profit at Block Level",
    subtitle = "Dashed = 1 profit threshold; Dotted = intersect with cumulative curve",
    color = "Season Crop Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 10, hjust = 0.5),
    strip.text = element_text(face = "bold"),
    legend.position = "right"
  )
## Warning: Removed 119 rows containing missing values or values outside the scale range
## (`geom_point()`).

## What is the relation among performance in Yield and Performance in Profit separating by crop?

library(dplyr)
library(ggplot2)

# 1. Prepare filtered data
regression_data <- CONV_CA_Ratio %>%
  filter(
    is.finite(Y_ratio),
    is.finite(Profit_ratio),
    Profit_ratio > -5, Profit_ratio < 5,
    IQR_SeasAB %in% c("A", "B")
  ) %>%
  mutate(
    crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
  )

# 2. Create plot with regression line and 1:1 reference line
ggplot(regression_data, aes(x = Y_ratio, y = Profit_ratio)) +
  geom_point(alpha = 0.4, color = "gray40", size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue", linewidth = 0.8) +
  facet_wrap(~crop_type) +
  labs(
    title = "Relationship Between Yield Ratio and Profit Ratio by Crop",
    x = "Relative Yield (Y_ratio: CA / CONV)",
    y = "Relative Profit (Profit_ratio: CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

#How does the performance of CA is affected by the environmental index (yield of the control?) ## First we construct an environment index This is the relative yield of each of the treatments in a block as compared to all the blocks of that treatment in the same season and environment. Theh we average the relative yield of all the treatments in a block for a season and we have the environmental indeal of it expresed as a ratio where if > 1 then the environment as expresed by the crops performance was good Notes: As a whole, the relation is weak, however, yield performance of CA (CA:CONV ratio) start to see more very bad scores for relatively poor environments (EI<1)

library(dplyr)

# Step 1: Make sure relevant columns exist
df <- ca_AC_clean %>%
  filter(!is.na(DC_Yield_t_ha), !is.na(Treat_code), !is.na(block_season), !is.na(IQR_Season), !is.na(IQF_environment))

# Step 2: Compute average yield per Treat_code within IQR_Season + IQF_environment
ref_yields <- df %>%
  group_by(Treat_code, IQR_Season, IQF_environment) %>%
  summarise(avg_env_yield = mean(DC_Yield_t_ha, na.rm = TRUE), .groups = "drop")

# Step 3: Join reference yields back to main data
df_with_ref <- df %>%
  left_join(ref_yields, by = c("Treat_code", "IQR_Season", "IQF_environment")) %>%
  mutate(
    yield_ratio_env = DC_Yield_t_ha / avg_env_yield
  )

# Step 4: Compute average of those ratios per block-season
env_index_df <- df_with_ref %>%
  group_by(block_season) %>%
  summarise(env_index = mean(yield_ratio_env, na.rm = TRUE), .groups = "drop")

# Step 5: View result
head(env_index_df)
## # A tibble: 6 × 2
##   block_season env_index
##   <chr>            <dbl>
## 1 102_10_23A       0.721
## 2 102_10_23B       0.816
## 3 102_10_24A       1.12 
## 4 102_10_24B       0.862
## 5 102_10_25A       1.26 
## 6 102_10_25B       1.18
# Then we paste it into CONV_CA_Ratio
# Merge environment index into CONV_CA_Ratio by block_season
CONV_CA_Ratio <- CONV_CA_Ratio %>%
  left_join(env_index_df, by = "block_season")

# View result
head(CONV_CA_Ratio)
## # A tibble: 6 × 105
##   IQR_Season Seas_block_treat IQR_SeasAB `Sample code`    IQF_agzone  IQF_region
##   <chr>      <chr>            <chr>      <chr>            <chr>       <chr>     
## 1 23A        23A79_15C        A          23A_i_P2_79_15_T Mayaga-Bug… S         
## 2 23A        23A58_10C        A          23A_i_P2_58_10_T Central Pl… NW        
## 3 23A        23A53_4C         A          23A_i_P2_53_4_T  Volcanic c… NW        
## 4 23A        23A38_15C        A          23A_i_P2_38_15_T Buberuka h… NE        
## 5 23A        23A9_14C         A          23A_i_P2_9_14_T  Eastern Ri… E         
## 6 23A        23A32_3C         A          23A_i_P2_32_3_T  Eastern Sa… NE        
## # ℹ 99 more variables: IQF_District <chr>, IQF_environment <chr>,
## #   IQR_TF_tubura_client <dbl>, ICR_age_hh_head <dbl>, IQR_sex_HH_head <chr>,
## #   IQR_TF_role <chr>, ICR_HH_size <dbl>, ICR_arable_land_owned <dbl>,
## #   IQR_block <chr>, Farm_ID <chr>, IQR_source_income <chr>,
## #   IQR_TF_role_agrdecision <chr>, ICF_Lat <dbl>, ICF_Long <dbl>,
## #   ICF_Alt <dbl>, IQR_plot_fert_quality <chr>, IQF_SeasonblockID <chr>,
## #   IQR_soil_texture <chr>, IQR_soil_color <chr>, …

Now we evaluate the relation among EI and evonomic and yield avdantage of CA..for amize and beans separated

library(dplyr)
library(ggplot2)
library(ggpmisc)  # for equation and p-value annotation
## Loading required package: ggpp
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
## 
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## Registered S3 method overwritten by 'ggpmisc':
##   method                  from   
##   as.character.polynomial polynom
# 1. Prepare data (only Yield Ratio, up to 2.5)
plot_data <- CONV_CA_Ratio %>%
  filter(
    crop %in% c("Maize", "Beans"),
    is.finite(Y_ratio),
    is.finite(env_index),
    Y_ratio <= 2.5
  )

# 2. Plot with regression + equation
ggplot(plot_data, aes(x = env_index, y = Y_ratio)) +
  geom_point(alpha = 0.4, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue") +
  stat_poly_eq(
    aes(label = paste(..eq.label.., ..p.value.label.., sep = "~~~")),
    formula = y ~ x,
    parse = TRUE,
    label.x = "left", label.y = "top",
    size = 3.5
  ) +
  facet_wrap(~ crop) +
  scale_y_continuous(limits = c(0, 2.5)) +
  labs(
    title = "Relationship Between Environment Index and Yield Ratio (CA / CONV)",
    x = "Environment Index",
    y = "Yield Ratio (CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )
## Warning: The dot-dot notation (`..eq.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(eq.label)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Segmented linear model

# Load required packages
library(dplyr)
library(ggplot2)
library(segmented)
## Warning: package 'segmented' was built under R version 4.3.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
## 
##     lmList
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(glue)
## Warning: package 'glue' was built under R version 4.3.3
library(purrr)

# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
  filter(
    crop %in% c("Maize", "Beans"),
    is.finite(Y_ratio),
    is.finite(env_index),
    Y_ratio <= 2.5
  )

# 2. Fit piecewise regression per crop, extract slopes and p-values safely
segmented_results <- plot_data %>%
  group_split(crop) %>%
  map_df(function(df) {
    crop_name <- unique(df$crop)

    # Fit linear model
    lm_fit <- lm(Y_ratio ~ env_index, data = df)

    # Try segmented model
    seg_fit <- tryCatch(
      segmented(lm_fit, seg.Z = ~ env_index, psi = median(df$env_index)),
      error = function(e) NULL
    )

    if (!is.null(seg_fit)) {
      df$fit <- fitted(seg_fit)
      df$breakpoint <- seg_fit$psi[2]

      # Try to extract slopes
      slopes_df <- tryCatch(slope(seg_fit)$env_index, error = function(e) NULL)

      if (!is.null(slopes_df) && nrow(slopes_df) >= 2) {
        slope1 <- slopes_df[1, "Est."]
        slope2 <- slopes_df[2, "Est."]

        # Check if "p" column exists before accessing
        if ("p" %in% colnames(slopes_df)) {
          pval1 <- slopes_df[1, "p"]
          pval2 <- slopes_df[2, "p"]
          df$slope_label <- glue("Slope1 = {round(slope1, 3)} (p = {format.pval(pval1, digits = 2)})\n",
                                 "Slope2 = {round(slope2, 3)} (p = {format.pval(pval2, digits = 2)})")
        } else {
          df$slope_label <- glue("Slope1 = {round(slope1, 3)}\nSlope2 = {round(slope2, 3)}")
        }

      } else {
        df$slope_label <- "Slopes not available"
      }

    } else {
      df$fit <- NA
      df$breakpoint <- NA
      df$slope_label <- "Model failed"
    }

    df
  })

# 3. Plot with segmented line and slope annotations
ggplot(segmented_results, aes(x = env_index, y = Y_ratio)) +
  geom_point(alpha = 0.4, size = 2) +
  geom_line(aes(y = fit), color = "darkred", linewidth = 1) +
  geom_text(
    data = segmented_results %>%
      group_by(crop) %>%
      slice(1),  # only show one label per facet
    aes(x = -Inf, y = Inf, label = slope_label),
    hjust = -0.05, vjust = 1.1,
    inherit.aes = FALSE,
    size = 3.5,
    fontface = "italic"
  ) +
  facet_wrap(~ crop) +
  scale_y_continuous(limits = c(0, 2.5)) +
  labs(
    title = "Piecewise Linear Relationship Between Env Index and Yield Ratio",
    x = "Environment Index",
    y = "Yield Ratio (CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

##Polinomial?

library(dplyr)
library(ggplot2)
library(broom)
library(glue)

# 1. Prepare data
plot_data <- CONV_CA_Ratio %>%
  filter(
    crop %in% c("Maize", "Beans"),
    is.finite(Y_ratio),
    is.finite(env_index),
    Y_ratio <= 2.5
  )

# 2. Compute R² for each model type and crop
r2_data <- plot_data %>%
  group_by(crop) %>%
  group_map(~ {
    df <- .x
    crop_label <- .y$crop

    models <- list(
      Linear = lm(Y_ratio ~ env_index, data = df),
      Quadratic = lm(Y_ratio ~ poly(env_index, 2), data = df),
      Cubic = lm(Y_ratio ~ poly(env_index, 3), data = df),
      LOESS = loess(Y_ratio ~ env_index, data = df)
    )

    r2_vals <- sapply(models, function(m) {
      s <- tryCatch(summary(m), error = function(e) NULL)
      if (!is.null(s$r.squared)) s$r.squared else NA_real_
    })

    tibble(
      crop = crop_label,
      model = names(models),
      r2 = as.numeric(r2_vals)
    )
  }) %>%
  bind_rows() %>%
  mutate(r2_label = glue("{model}: R² = {round(r2, 3)}"))

# 3. Create the base plot
p <- ggplot(plot_data, aes(x = env_index, y = Y_ratio)) +
  geom_point(alpha = 0.4, size = 2) +
  facet_wrap(~ crop) +
  scale_y_continuous(limits = c(0, 2.5)) +
  labs(
    title = "Model Shape Comparison: Yield Ratio vs Environment Index",
    x = "Environment Index",
    y = "Yield Ratio (CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

# 4. Add all smooth lines
p <- p +
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1, linetype = "solid") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red", linewidth = 1, linetype = "dashed") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "darkgreen", linewidth = 1, linetype = "dotdash") +
  geom_smooth(method = "loess", se = FALSE, color = "purple", linewidth = 1, linetype = "twodash")

# 5. Position for R² text
r2_data <- r2_data %>%
  group_by(crop) %>%
  mutate(
    y = 2.5 - 0.2 * (row_number() - 1),
    x = min(plot_data$env_index, na.rm = TRUE) + 0.05
  )

# 6. Add R² labels to plot
p + geom_text(
  data = r2_data,
  aes(x = x, y = y, label = r2_label),
  inherit.aes = FALSE,
  hjust = 0,
  size = 3.5,
  fontface = "italic"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

##Now with profit

library(dplyr)
library(ggplot2)
library(broom)
library(glue)

# 1. Prepare data (limit extreme profit ratios)
plot_data <- CONV_CA_Ratio %>%
  filter(
    crop %in% c("Maize", "Beans"),
    is.finite(Profit_ratio),
    is.finite(env_index),
    Profit_ratio <= 5,  # optional: limit outliers
    Profit_ratio >= -5
  )

# 2. Compute R² for each model type and crop
r2_data <- plot_data %>%
  group_by(crop) %>%
  group_map(~ {
    df <- .x
    crop_label <- .y$crop

    models <- list(
      Linear = lm(Profit_ratio ~ env_index, data = df),
      Quadratic = lm(Profit_ratio ~ poly(env_index, 2), data = df),
      Cubic = lm(Profit_ratio ~ poly(env_index, 3), data = df),
      LOESS = loess(Profit_ratio ~ env_index, data = df)
    )

    r2_vals <- sapply(models, function(m) {
      s <- tryCatch(summary(m), error = function(e) NULL)
      if (!is.null(s$r.squared)) s$r.squared else NA_real_
    })

    tibble(
      crop = crop_label,
      model = names(models),
      r2 = as.numeric(r2_vals)
    )
  }) %>%
  bind_rows() %>%
  mutate(r2_label = glue("{model}: R² = {round(r2, 3)}"))

# 3. Create base plot
p <- ggplot(plot_data, aes(x = env_index, y = Profit_ratio)) +
  geom_point(alpha = 0.4, size = 2) +
  facet_wrap(~ crop) +
  scale_y_continuous(limits = c(-5, 5)) +
  labs(
    title = "Model Shape Comparison: Profit Ratio vs Environment Index",
    x = "Environment Index",
    y = "Profit Ratio (CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

# 4. Add all smooth trendlines
p <- p +
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1, linetype = "solid") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red", linewidth = 1, linetype = "dashed") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "darkgreen", linewidth = 1, linetype = "dotdash") +
  geom_smooth(method = "loess", se = FALSE, color = "purple", linewidth = 1, linetype = "twodash")

# 5. Position R² labels
r2_data <- r2_data %>%
  group_by(crop) %>%
  mutate(
    y = 5 - 0.4 * (row_number() - 1),
    x = min(plot_data$env_index, na.rm = TRUE) + 0.05
  )

# 6. Add R² labels
p + geom_text(
  data = r2_data,
  aes(x = x, y = y, label = r2_label),
  inherit.aes = FALSE,
  hjust = 0,
  size = 3.5,
  fontface = "italic"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Does the % wins of CA increse progresibely from season 1 to 6? Separated by crop

NOtes: NO, it seems that there is actually not a general trend for yield imperovement (for each crops separately) along the 3 years. However, there is yes plots where there is a positive trend; more for beans than for maize. This may be related to the fact that initially the cover crop did lots of competition with the beans and was corrected a little each season to reduce it..???

library(dplyr)
library(ggplot2)
library(broom)
library(tidyr)

# -----------------------------
# 1. Prepare data
# -----------------------------
trend_data <- CONV_CA_Ratio %>%
  filter(
    is.finite(Y_ratio),
    year %in% c("23", "24", "25"),
    IQR_SeasAB %in% c("A", "B")
  ) %>%
  mutate(
    year = as.numeric(year),
    crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
  )

# -----------------------------
# 2. Function to compute slope counts
# -----------------------------
get_slope_label <- function(df) {
  # One slope per block
  slopes <- df %>%
    group_by(IQR_block) %>%
    filter(n_distinct(year) >= 2) %>%
    summarise(slope = coef(lm(Y_ratio ~ year))[2], .groups = "drop") %>%
    mutate(
      direction = case_when(
        slope > 0 ~ "↑ Positive",
        slope < 0 ~ "↓ Negative",
        TRUE ~ "= Flat"
      )
    ) %>%
    count(direction) %>%
    complete(direction, fill = list(n = 0))

  # Format label text
  paste0(
    "↑ Positive: ", slopes$n[slopes$direction == "↑ Positive"], "\n",
    "↓ Negative: ", slopes$n[slopes$direction == "↓ Negative"], "\n",
    "= Flat: ", slopes$n[slopes$direction == "= Flat"]
  )
}


# Labels
label_maize <- get_slope_label(trend_data %>% filter(crop_type == "Maize"))
label_beans <- get_slope_label(trend_data %>% filter(crop_type == "Beans"))

# Common y-limit helper
y_max <- max(trend_data$Y_ratio, na.rm = TRUE)

# -----------------------------
# 3. MAIZE plot
# -----------------------------
p_maize <- ggplot(
  trend_data %>% filter(crop_type == "Maize"),
  aes(x = year, y = Y_ratio)
) +
  geom_point(alpha = 0.35, size = 2) +
  
  # Block-level linear trends
  geom_smooth(
    aes(group = IQR_block),
    method = "lm",
    se = FALSE,
    color = "steelblue",
    linewidth = 0.6
  ) +
  
  # Global trend
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "red",
    linewidth = 1.3
  ) +
  
  # Annotation
  annotate(
    "label",
    x = 23.05,
    y = y_max * 0.95,
    label = label_maize,
    hjust = 0,
    vjust = 1,
    size = 4.5,
    fontface = "bold",
    fill = "white",
    label.size = 0.4
  ) +
  
  scale_x_continuous(breaks = c(23, 24, 25)) +
  labs(
    title = "Block-level and Overall Linear Trends of Y_ratio (Maize)",
    x = "Year",
    y = "Relative Yield (Y_ratio)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_maize,
## : Ignoring unknown parameters: `label.size`
# -----------------------------
# 4. BEANS plot
# -----------------------------
p_beans <- ggplot(
  trend_data %>% filter(crop_type == "Beans"),
  aes(x = year, y = Y_ratio)
) +
  geom_point(alpha = 0.35, size = 2) +
  
  # Block-level linear trends
  geom_smooth(
    aes(group = IQR_block),
    method = "lm",
    se = FALSE,
    color = "darkorange",
    linewidth = 0.6
  ) +
  
  # Global trend
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "red",
    linewidth = 1.3
  ) +
  
  # Annotation
  annotate(
    "label",
    x = 23.05,
    y = y_max * 0.95,
    label = label_beans,
    hjust = 0,
    vjust = 1,
    size = 4.5,
    fontface = "bold",
    fill = "white",
    label.size = 0.4
  ) +
  
  scale_x_continuous(breaks = c(23, 24, 25)) +
  labs(
    title = "Block-level and Overall Linear Trends of Y_ratio (Beans)",
    x = "Year",
    y = "Relative Yield (Y_ratio)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_beans,
## : Ignoring unknown parameters: `label.size`
# -----------------------------
# 5. Print plots
# -----------------------------
print(p_maize)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

print(p_beans)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## For a farm, the trend for maize is same as for beans? NOte: There is not a clear relation among the trent in yield for maize and beans (that is the trend of yield along seasons is almost independent among crops for the same farm)

library(dplyr)
library(ggplot2)
library(tidyr)

# STEP 1: Calculate slopes per crop per block
slope_data <- CONV_CA_Ratio %>%
  filter(
    is.finite(Y_ratio),
    year %in% c("23", "24", "25"),
    IQR_SeasAB %in% c("A", "B")
  ) %>%
  mutate(
    year = as.numeric(year),
    crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
  ) %>%
  group_by(IQR_block, crop_type) %>%
  filter(n_distinct(year) >= 2) %>%
  summarise(
    slope = coef(lm(Y_ratio ~ year))[2],
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = crop_type,
    values_from = slope
  ) %>%
  filter(!is.na(Maize) & !is.na(Beans))

# STEP 2: Classify quadrants
slope_data <- slope_data %>%
  mutate(
    quadrant = case_when(
      Maize > 0 & Beans > 0 ~ "Q1: ↑ Maize & ↑ Beans",
      Maize < 0 & Beans > 0 ~ "Q2: ↓ Maize & ↑ Beans",
      Maize < 0 & Beans < 0 ~ "Q3: ↓ Maize & ↓ Beans",
      Maize > 0 & Beans < 0 ~ "Q4: ↑ Maize & ↓ Beans",
      TRUE ~ "Other"
    )
  )

# STEP 3: Count blocks per quadrant
quad_counts <- slope_data %>%
  count(quadrant) %>%
  filter(quadrant != "Other")

# STEP 4: Coordinates for text labels per quadrant
label_positions <- tibble::tibble(
  quadrant = c("Q1: ↑ Maize & ↑ Beans", "Q2: ↓ Maize & ↑ Beans",
               "Q3: ↓ Maize & ↓ Beans", "Q4: ↑ Maize & ↓ Beans"),
  x = c( max(slope_data$Maize, na.rm = TRUE) * 0.85,
         min(slope_data$Maize, na.rm = TRUE) * 0.85,
         min(slope_data$Maize, na.rm = TRUE) * 0.85,
         max(slope_data$Maize, na.rm = TRUE) * 0.85),
  y = c( max(slope_data$Beans, na.rm = TRUE) * 0.85,
         max(slope_data$Beans, na.rm = TRUE) * 0.85,
         min(slope_data$Beans, na.rm = TRUE) * 0.85,
         min(slope_data$Beans, na.rm = TRUE) * 0.85)
) %>%
  left_join(quad_counts, by = "quadrant") %>%
  mutate(label = paste0(quadrant, "\nN = ", n))


# STEP 5: Final plot
ggplot(slope_data, aes(x = Maize, y = Beans)) +
  geom_point(alpha = 0.7, size = 2.5, color = "purple") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted") +
  geom_text(data = label_positions,
            aes(x = x, y = y, label = label),
            size = 4.2, hjust = 0.5, vjust = 0.5, fontface = "bold") +
  labs(
    title = "Comparison of Yield Trends (Slopes) per Block",
    subtitle = "Each point is one IQR_block — comparing Maize vs Beans trend slopes",
    x = "Maize trend slope",
    y = "Beans trend slope"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

Mix models to evaluate treatment effect in absolute yield values, the interactions and the destribution of the residuals

Mix model and interactions

Note: Effectes of the 3 factors are significant and also the interactions. So the CA vs. CONV comparison needs to be opened and evaluated by season-environment combination

fit = lmer(DC_Yield_t_ha~IQR_Season + IQF_environment + Treat_code +
             IQR_Season:IQF_environment +  IQF_environment:Treat_code + IQR_Season:Treat_code+
              IQR_Season:IQF_environment:Treat_code+
             (1|IQR_block),
           data = ca_AC_clean)


summary(fit)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## DC_Yield_t_ha ~ IQR_Season + IQF_environment + Treat_code + IQR_Season:IQF_environment +  
##     IQF_environment:Treat_code + IQR_Season:Treat_code + IQR_Season:IQF_environment:Treat_code +  
##     (1 | IQR_block)
##    Data: ca_AC_clean
## 
## REML criterion at convergence: 18145.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1423 -0.5809 -0.0357  0.5168  3.8186 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  IQR_block (Intercept) 0.7352   0.8574  
##  Residual              1.1814   1.0869  
## Number of obs: 5654, groups:  IQR_block, 567
## 
## Fixed effects:
##                                                        Estimate Std. Error
## (Intercept)                                             5.07265    0.11767
## IQR_Season23B                                          -3.89347    0.14310
## IQR_Season24A                                          -0.57716    0.14608
## IQR_Season24B                                          -3.43020    0.14315
## IQR_Season25A                                          -1.31273    0.30002
## IQR_Season25B                                          -3.65663    0.14737
## IQF_environmentMid_Low_Dry                              0.50789    0.14817
## IQF_environmentMid_Low_Wet                             -0.57116    0.15895
## IQF_environmentTransition                              -0.33084    0.17251
## Treat_codeC                                            -0.58904    0.13926
## IQR_Season23B:IQF_environmentMid_Low_Dry                0.31837    0.18685
## IQR_Season24A:IQF_environmentMid_Low_Dry                0.92759    0.19113
## IQR_Season24B:IQF_environmentMid_Low_Dry               -0.41498    0.18889
## IQR_Season25A:IQF_environmentMid_Low_Dry               -0.28488    0.32428
## IQR_Season25B:IQF_environmentMid_Low_Dry                0.15796    0.19279
## IQR_Season23B:IQF_environmentMid_Low_Wet                0.39614    0.19425
## IQR_Season24A:IQF_environmentMid_Low_Wet                0.28734    0.19991
## IQR_Season24B:IQF_environmentMid_Low_Wet               -0.02848    0.19666
## IQR_Season25A:IQF_environmentMid_Low_Wet                1.18206    0.33027
## IQR_Season25B:IQF_environmentMid_Low_Wet                0.26227    0.20164
## IQR_Season23B:IQF_environmentTransition                 0.75971    0.21271
## IQR_Season24A:IQF_environmentTransition                 0.34742    0.21956
## IQR_Season24B:IQF_environmentTransition                 0.67783    0.21652
## IQR_Season25A:IQF_environmentTransition                 1.17576    0.33871
## IQR_Season25B:IQF_environmentTransition                 0.75964    0.22011
## IQF_environmentMid_Low_Dry:Treat_codeC                 -0.12676    0.18153
## IQF_environmentMid_Low_Wet:Treat_codeC                  0.02037    0.18881
## IQF_environmentTransition:Treat_codeC                   0.10068    0.20647
## IQR_Season23B:Treat_codeC                               0.37611    0.20078
## IQR_Season24A:Treat_codeC                              -1.23510    0.20469
## IQR_Season24B:Treat_codeC                              -0.06783    0.20058
## IQR_Season25A:Treat_codeC                              -0.15221    0.40874
## IQR_Season25B:Treat_codeC                               0.29254    0.20575
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC   -0.24892    0.26193
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC    1.01731    0.26749
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC    0.35035    0.26445
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC   -0.44379    0.43993
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC    0.04390    0.26910
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC   -0.07430    0.27266
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC    1.00901    0.28010
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC    0.19780    0.27563
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC   -0.16033    0.44962
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC    0.06295    0.28196
## IQR_Season23B:IQF_environmentTransition:Treat_codeC    -0.19715    0.29854
## IQR_Season24A:IQF_environmentTransition:Treat_codeC     0.75924    0.30740
## IQR_Season24B:IQF_environmentTransition:Treat_codeC     0.04022    0.30248
## IQR_Season25A:IQF_environmentTransition:Treat_codeC    -0.27436    0.46167
## IQR_Season25B:IQF_environmentTransition:Treat_codeC    -0.17918    0.30722
##                                                              df t value
## (Intercept)                                          3946.84230  43.108
## IQR_Season23B                                        5007.77869 -27.209
## IQR_Season24A                                        5014.15153  -3.951
## IQR_Season24B                                        5029.94067 -23.963
## IQR_Season25A                                        5095.20364  -4.375
## IQR_Season25B                                        5036.77759 -24.812
## IQF_environmentMid_Low_Dry                           5265.71529   3.428
## IQF_environmentMid_Low_Wet                           4244.04672  -3.593
## IQF_environmentTransition                            4466.82592  -1.918
## Treat_codeC                                          4934.68676  -4.230
## IQR_Season23B:IQF_environmentMid_Low_Dry             5024.75069   1.704
## IQR_Season24A:IQF_environmentMid_Low_Dry             5028.40349   4.853
## IQR_Season24B:IQF_environmentMid_Low_Dry             5038.70327  -2.197
## IQR_Season25A:IQF_environmentMid_Low_Dry             5152.64948  -0.879
## IQR_Season25B:IQF_environmentMid_Low_Dry             5044.03679   0.819
## IQR_Season23B:IQF_environmentMid_Low_Wet             5004.70035   2.039
## IQR_Season24A:IQF_environmentMid_Low_Wet             5014.82054   1.437
## IQR_Season24B:IQF_environmentMid_Low_Wet             5023.55760  -0.145
## IQR_Season25A:IQF_environmentMid_Low_Wet             5151.71514   3.579
## IQR_Season25B:IQF_environmentMid_Low_Wet             5029.47221   1.301
## IQR_Season23B:IQF_environmentTransition              5024.22535   3.572
## IQR_Season24A:IQF_environmentTransition              5036.14781   1.582
## IQR_Season24B:IQF_environmentTransition              5048.34564   3.131
## IQR_Season25A:IQF_environmentTransition              5140.32873   3.471
## IQR_Season25B:IQF_environmentTransition              5049.16391   3.451
## IQF_environmentMid_Low_Dry:Treat_codeC               4934.73832  -0.698
## IQF_environmentMid_Low_Wet:Treat_codeC               4934.43099   0.108
## IQF_environmentTransition:Treat_codeC                4936.59168   0.488
## IQR_Season23B:Treat_codeC                            4934.54897   1.873
## IQR_Season24A:Treat_codeC                            4933.26541  -6.034
## IQR_Season24B:Treat_codeC                            4934.27215  -0.338
## IQR_Season25A:Treat_codeC                            4932.21882  -0.372
## IQR_Season25B:Treat_codeC                            4933.17413   1.422
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC 4934.36696  -0.950
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 4933.62520   3.803
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 4933.92198   1.325
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC 4932.37895  -1.009
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 4933.18903   0.163
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC 4934.13254  -0.272
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 4933.57546   3.602
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 4934.53452   0.718
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC 4932.40823  -0.357
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 4933.03230   0.223
## IQR_Season23B:IQF_environmentTransition:Treat_codeC  4936.75531  -0.660
## IQR_Season24A:IQF_environmentTransition:Treat_codeC  4934.11571   2.470
## IQR_Season24B:IQF_environmentTransition:Treat_codeC  4935.72560   0.133
## IQR_Season25A:IQF_environmentTransition:Treat_codeC  4932.90235  -0.594
## IQR_Season25B:IQF_environmentTransition:Treat_codeC  4934.01727  -0.583
##                                                      Pr(>|t|)    
## (Intercept)                                           < 2e-16 ***
## IQR_Season23B                                         < 2e-16 ***
## IQR_Season24A                                        7.89e-05 ***
## IQR_Season24B                                         < 2e-16 ***
## IQR_Season25A                                        1.24e-05 ***
## IQR_Season25B                                         < 2e-16 ***
## IQF_environmentMid_Low_Dry                           0.000613 ***
## IQF_environmentMid_Low_Wet                           0.000330 ***
## IQF_environmentTransition                            0.055199 .  
## Treat_codeC                                          2.38e-05 ***
## IQR_Season23B:IQF_environmentMid_Low_Dry             0.088461 .  
## IQR_Season24A:IQF_environmentMid_Low_Dry             1.25e-06 ***
## IQR_Season24B:IQF_environmentMid_Low_Dry             0.028071 *  
## IQR_Season25A:IQF_environmentMid_Low_Dry             0.379705    
## IQR_Season25B:IQF_environmentMid_Low_Dry             0.412614    
## IQR_Season23B:IQF_environmentMid_Low_Wet             0.041471 *  
## IQR_Season24A:IQF_environmentMid_Low_Wet             0.150693    
## IQR_Season24B:IQF_environmentMid_Low_Wet             0.884848    
## IQR_Season25A:IQF_environmentMid_Low_Wet             0.000348 ***
## IQR_Season25B:IQF_environmentMid_Low_Wet             0.193442    
## IQR_Season23B:IQF_environmentTransition              0.000358 ***
## IQR_Season24A:IQF_environmentTransition              0.113631    
## IQR_Season24B:IQF_environmentTransition              0.001754 ** 
## IQR_Season25A:IQF_environmentTransition              0.000522 ***
## IQR_Season25B:IQF_environmentTransition              0.000563 ***
## IQF_environmentMid_Low_Dry:Treat_codeC               0.485007    
## IQF_environmentMid_Low_Wet:Treat_codeC               0.914091    
## IQF_environmentTransition:Treat_codeC                0.625834    
## IQR_Season23B:Treat_codeC                            0.061097 .  
## IQR_Season24A:Treat_codeC                            1.72e-09 ***
## IQR_Season24B:Treat_codeC                            0.735259    
## IQR_Season25A:Treat_codeC                            0.709621    
## IQR_Season25B:Treat_codeC                            0.155149    
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC 0.341986    
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 0.000145 ***
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 0.185282    
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC 0.313132    
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 0.870420    
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC 0.785259    
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 0.000319 ***
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 0.473017    
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC 0.721415    
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 0.823344    
## IQR_Season23B:IQF_environmentTransition:Treat_codeC  0.509028    
## IQR_Season24A:IQF_environmentTransition:Treat_codeC  0.013548 *  
## IQR_Season24B:IQF_environmentTransition:Treat_codeC  0.894235    
## IQR_Season25A:IQF_environmentTransition:Treat_codeC  0.552351    
## IQR_Season25B:IQF_environmentTransition:Treat_codeC  0.559773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 48 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
anova(fit)
## Type III Analysis of Variance Table with Satterthwaite's method
##                                        Sum Sq Mean Sq NumDF  DenDF   F value
## IQR_Season                            10906.4 2181.29     5 5090.1 1846.3812
## IQF_environment                         144.2   48.06     3 1584.7   40.6770
## Treat_code                              456.2  456.21     1 4932.1  386.1684
## IQR_Season:IQF_environment              572.7   38.18    15 5134.1   32.3174
## IQF_environment:Treat_code               10.0    3.32     3 4935.8    2.8144
## IQR_Season:Treat_code                   106.8   21.36     5 4932.0   18.0778
## IQR_Season:IQF_environment:Treat_code    42.0    2.80    15 4933.6    2.3681
##                                          Pr(>F)    
## IQR_Season                            < 2.2e-16 ***
## IQF_environment                       < 2.2e-16 ***
## Treat_code                            < 2.2e-16 ***
## IQR_Season:IQF_environment            < 2.2e-16 ***
## IQF_environment:Treat_code             0.037797 *  
## IQR_Season:Treat_code                 < 2.2e-16 ***
## IQR_Season:IQF_environment:Treat_code  0.002128 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Visual of CA:CONV be season-environemnt convination

# Load required libraries
library(dplyr)
library(ggplot2)

# 1. Filter and prepare main data
plot_data <- CONV_CA_Ratio %>%
  filter(is.finite(Y_ratio), Y_ratio <= 5)

# 2. Compute p-values, significance, and means
sig_labels <- plot_data %>%
  group_by(IQR_Season, IQF_environment, crop) %>%
  summarise(
    mean_y = mean(Y_ratio, na.rm = TRUE),
    p_value = tryCatch(t.test(Y_ratio, mu = 1)$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  mutate(
    signif = case_when(
      is.na(p_value) ~ "",
      p_value < 0.01 ~ "**",
      p_value < 0.05 ~ "*",
      TRUE ~ ""
    ),
    mean_label = round(mean_y, 2)  # Rounded mean
  )

# 3. Get max Y for each group to place stars
y_pos <- plot_data %>%
  group_by(IQR_Season, IQF_environment, crop) %>%
  summarise(y_max = max(Y_ratio, na.rm = TRUE), .groups = "drop")

# 4. Merge label + position
sig_labels <- left_join(sig_labels, y_pos, by = c("IQR_Season", "IQF_environment", "crop"))

# 5. Plot
ggplot(plot_data, aes(x = IQR_Season, y = Y_ratio, color = crop)) +
  geom_boxplot(outlier.alpha = 0.3) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray40") +
  facet_wrap(~ IQF_environment) +

  # Add significance stars
  geom_text(
    data = sig_labels,
    aes(x = IQR_Season, y = y_max + 0.15, label = signif, group = crop),
    inherit.aes = FALSE,
    size = 5,
    fontface = "bold"
  ) +

  # Add mean values just above the stars
  geom_text(
    data = sig_labels,
    aes(x = IQR_Season, y = y_max + 0.7, label = mean_label, group = crop),
    inherit.aes = FALSE,
    size = 3.5,
    fontface = "plain",
    color = "black"
  ) +

  labs(
    title = "Yield Ratio (CA / CONV) by Season and Environment",
    x = "Season",
    y = "Yield Ratio"
  ) +
  ylim(0, 5.7) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

### Distribution of the residuals from mix model Although Shapiro–Wilk tests indicated statistically significant deviations from normality (p < 0.001), that my be because of the large sample size; W=0.99 the residuals are close to normality and Q–Q plots showed no substantial departures from normality, indicating that model assumptions were adequately met.

library(broom.mixed)
library(dplyr)

resid_df <- augment(fit) %>%
  mutate(
    resid = .resid,
    fitted = .fitted
  )
library(ggplot2)

ggplot(resid_df, aes(x = resid)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 40,
                 fill = "grey80",
                 color = "black") +
  geom_density(color = "blue", linewidth = 1) +
  labs(
    title = "Distribution of residuals",
    x = "Residuals",
    y = "Density"
  ) +
  theme_minimal()

ggplot(resid_df, aes(sample = resid)) +
  stat_qq(color = "black") +
  stat_qq_line(color = "red", linewidth = 1) +
  labs(
    title = "Q–Q plot of residuals"
  ) +
  theme_minimal()

ggplot(resid_df, aes(x = fitted, y = resid)) +
  geom_point(alpha = 0.4) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs fitted values",
    x = "Fitted values",
    y = "Residuals"
  ) +
  theme_minimal()

### Test of residuals H0 rejected but because we have large N, the W is actually very close to 1…so its quite normal

set.seed(123)
shapiro.test(sample(resid_df$resid, 5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid_df$resid, 5000)
## W = 0.98611, p-value < 2.2e-16

Random effects normality

Mean is 0 and symetric distribution of the residual of the random effects

qqnorm(ranef(fit)$IQR_block[[1]])
qqline(ranef(fit)$IQR_block[[1]], col = "red")

Typology of Yield Performance

Typology of multi-season CA performance (23A–25B)

To describe how farms perform under CA over time, we created a typology of block-level responses using their relative yield (CA / CONV) across six seasons (23A–25B). Only blocks with data in at least four seasons were retained to ensure stable classification.

For each block, we calculated its average relative yield, identified whether it ever experienced a yield penalty greater than 5%, and compiled its full sequence of seasonal performances. Based on these indicators, each farm was grouped into one of four long-term performance types: -Good: no season < 0.95 -Acceptable: Average ≥ 0.95 but at least one season < 0.95 -Poor: Average between 0.85 and 0.95 -Very Poor: Average ≤ 0.85

These thresholds were defined arbitrarily but intentionally, informed by field experience and our understanding of what yield reductions farmers may find acceptable if longer-term soil-health and economic benefits of CA are demonstrated and communicated clearly. Importantly, this typology is not final—it will be refined and validated through farmer consultations to ensure the categories and cut-off points align with farmers’ own perceptions of acceptable and unacceptable performance.

Notes: Average performance of CA is very bad, even if we target the environmetns or the districts with best performance, this blanket reccomednation would fail baddly. However,if we would be able to target the plots with good performance.. the rate of success of the reccomendatio could be high…for aprox 20% of the plots (i.e., 35% of the farmers…)

library(dplyr)
library(tidyr)

# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")

# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>%
  group_by(IQR_block) %>%
  summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
  filter(n_valid_seasons >= 4)

# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- CONV_CA_Ratio %>%
  filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
         IQR_Season %in% seasons,
         is.finite(Y_ratio))

# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
  group_by(IQR_block, IQR_Season) %>%
  summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = IQR_Season,
    values_from = Y_ratio,
    names_prefix = "Y_ratio_"
  )

# Step 5: Compute performance typology
Perf_Type <- filtered_yield %>%
  group_by(IQR_block) %>%
  summarise(
    n_seasons = n_distinct(IQR_Season),
    any_below_0.95 = any(Y_ratio < 0.95),
    mean_ratio = mean(Y_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    CA_typology = case_when(
      !any_below_0.95 ~ "Good",
      any_below_0.95 & mean_ratio >= 0.95 ~ "Acceptable",
      any_below_0.95 & mean_ratio < 0.95 & mean_ratio > 0.85 ~ "Poor",
      mean_ratio <= 0.85 ~ "Very Poor"
    )
  )

# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
  arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>%  # Prioritize 23A > 24A > 25A
  distinct(IQR_block, .keep_all = TRUE) %>%
  dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)


# Step 7: Join all into Perf_Type
Perf_Type <- Perf_Type %>%
  left_join(RY_wide, by = "IQR_block") %>%
  left_join(block_meta, by = "IQR_block")

Now we analyse the frequency of each type

Ahore por environment

# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
                                levels = c("Very Poor", "Poor", "Acceptable", "Good"))

ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
  geom_bar(
    aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
    position = "stack"
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "CA-Performance Typology Distribution by OAF-AEZ",
    x = "Performance Typology",
    y = "Percentage of Performance Typologies"
  ) +
  scale_fill_manual(values = c(
    "Very Poor" = "#e41a1c",
    "Poor" = "#ff7f00",
    "Acceptable" = "#377eb8",
    "Good" = "#4daf4a"
  )) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  )

## What if we target the districts with the best performnace? wht could be the rate of success? Notes: Yes better chance but it would still fail in all

# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
                                levels = c("Very Poor", "Poor", "Acceptable", "Good"))

ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
  geom_bar(
    aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
    position = "stack"
  ) +
  facet_wrap(~ IQF_District) +
  labs(
    title = "CA-Performance Typology Distribution by District",
    x = "Performance Typology",
    y = "Percentage of Performance Typologies"
  ) +
  scale_fill_manual(values = c(
    "Very Poor" = "#e41a1c",
    "Poor" = "#ff7f00",
    "Acceptable" = "#377eb8",
    "Good" = "#4daf4a"
  )) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  )

## Another way to see it

library(dplyr)
library(ggplot2)
library(glue)

# 1. Mean Y_ratio per block (across seasons)
block_means <- CONV_CA_Ratio %>%
  filter(is.finite(Y_ratio)) %>%
  group_by(IQR_block, IQF_District) %>%
  summarise(mean_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")

# 2. Summary for annotations
district_summary <- block_means %>%
  group_by(IQF_District) %>%
  summarise(
    avg_ratio = round(mean(mean_Y_ratio, na.rm = TRUE), 2),
    n_total = n(),
    n_ge1 = sum(mean_Y_ratio >= 1, na.rm = TRUE),
    percent_ge1 = round(100 * n_ge1 / n_total)
  ) %>%
  mutate(
    label = glue("≥1: {percent_ge1}%\nµ = {avg_ratio}"),
    y_pos = 1.5
  )

# 3. Plot
ggplot(block_means, aes(x = IQF_District, y = mean_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "gray90") +
  geom_jitter(width = 0.2, alpha = 0.5, color = "steelblue", size = 2) +
  geom_text(
    data = district_summary,
    aes(x = IQF_District, y = y_pos, label = label),
    inherit.aes = FALSE,
    size = 3.5,
    vjust = 0
  ) +
  labs(
    title = "Distribution of Mean Y_ratio per Block by District",
    x = "District",
    y = "Mean Y_ratio per Block (across seasons)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

# Drivers of yield performance We start with the database of yield drivers that was already processed in the scrips: “data_mm_block” but we change the relative

data_mm_block <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/data_mm_block.csv")
## Rows: 473 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): IQR_block, IQR_TF_tubura_client, IQR_plot_fert_quality, IQR_soil_te...
## dbl (8): ICR_Org_C_avg, ICR_K_perc_23A, ICR_Avail_P_avg, ICR_arable_land_avg...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drivers_data <- data_mm_block %>%
  mutate(
    CA_typology = fct_drop(CA_typology),

    IQR_block = factor(IQR_block),
    IQR_TF_tubura_client  = factor(IQR_TF_tubura_client),
    IQR_plot_fert_quality = factor(IQR_plot_fert_quality),
    IQR_soil_texture      = fct_na_value_to_level(factor(IQR_soil_texture), "Unknown"),
    IQR_soil_color        = fct_na_value_to_level(factor(IQR_soil_color), "Unknown"),
    IQF_position_slope    = fct_na_value_to_level(factor(IQF_position_slope), "Unknown"),
    Weed_species_A_combined = fct_na_value_to_level(factor(Weed_species_A_combined), "Unknown")
  )

str(drivers_data)
## tibble [473 × 17] (S3: tbl_df/tbl/data.frame)
##  $ IQR_block              : Factor w/ 473 levels "10_1","10_10",..: 15 16 17 18 19 20 21 22 23 24 ...
##  $ ICR_Org_C_avg          : num [1:473] 2.01 1.75 2.16 1.68 1.99 ...
##  $ ICR_K_perc_23A         : num [1:473] 0.75 0.32 0.57 0.41 0.42 0.55 1.75 1.39 2.09 1.25 ...
##  $ ICR_Avail_P_avg        : num [1:473] 11.7 11.5 22 60.6 35.6 ...
##  $ ICR_arable_land_avg    : num [1:473] 40.4 56.4 45.8 54 28.2 54.6 51.8 47.8 60.2 38.2 ...
##  $ ICF_Alt_avg            : num [1:473] 1636 1293 1141 1207 1145 ...
##  $ ICR_rainfall_avg       : num [1:473] 3.96 3.97 3.97 3.97 3.97 ...
##  $ Comp_amount_corr_quali : num [1:473] 5.17 4.39 4.42 4.73 4.9 ...
##  $ ICF_planting_date      : num [1:473] 8.2 9.4 6.75 12.4 5.2 2.8 6.4 4.4 5.2 6.8 ...
##  $ IQR_TF_tubura_client   : Factor w/ 3 levels "---","0","1": 3 3 2 3 2 3 3 3 3 3 ...
##  $ IQR_plot_fert_quality  : Factor w/ 4 levels "---","high","low_quality",..: 4 2 4 2 4 4 4 2 4 2 ...
##  $ IQR_soil_texture       : Factor w/ 4 levels "Coarse","Fine",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ IQR_soil_color         : Factor w/ 6 levels "#N/A","black",..: 2 2 2 5 2 5 2 2 5 2 ...
##  $ IQF_position_slope     : Factor w/ 7 levels "---","Backslope",..: 4 2 2 2 4 2 6 2 2 2 ...
##  $ Weed_species_A_combined: Factor w/ 5 levels "—","Broadleaf",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ CA_typology            : Factor w/ 3 levels "Acceptable","Poor",..: 3 1 3 2 1 3 1 1 2 2 ...
##  $ IQF_environment        : chr [1:473] "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" ...

Now we paste the averaged relative yield along the 6 seasons to each bloc and we remove the of CA_typology

# 1. Remove 'CA_typology' column from drivers_data
drivers_data <- drivers_data |> dplyr::select(-CA_typology)

# 2. Calculate average Y_ratio by IQR_block from CONV_CA_Ratio
# First, ensure grouping by both IQR_block and IQR_season, then average per IQR_block
avg_Y_ratio <- CONV_CA_Ratio |>
  dplyr::group_by(IQR_block, IQR_Season) |>
  dplyr::summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") |>
  dplyr::group_by(IQR_block) |>
  dplyr::summarise(avg_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")

# 3. Merge avg_Y_ratio into drivers_data by IQR_block
drivers_data <- drivers_data |> 
  dplyr::left_join(avg_Y_ratio, by = "IQR_block")