#Libraries and Data upload

library(readr)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(DataExplorer)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ✔ stringr 1.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(sf)

## Warning: package 'sf' was built under R version 4.3.3

## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE

library(rnaturalearth)

## Warning: package 'rnaturalearth' was built under R version 4.3.3

ca_raw <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/raw/ca_raw.csv")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 11921 Columns: 94
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (54): IQR_Season, Seas_block_treat, IQR_SeasAB, Sample code, IQF_agzone,...
## dbl (39): ICF_Lat, ICF_Long, ICF_Alt, ICR_OM_perc, ICR_landprep_pers_usd_ha,...
## num  (1): DC_cost_usd_ha
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(dplyr)

ca_raw <- ca_raw %>%
  mutate(
    IQF_environment = case_when(
      IQF_environment %in% c("Mid Low Dry", "Mid_Low_Dry") ~ "Mid_Low_Dry",
      IQF_environment %in% c("Mid Low Wet", "Mid_Low_Wet") ~ "Mid_Low_Wet",
      TRUE ~ IQF_environment
    )
  )

Data pre-Cleaning

Some numeric columns are read as characters. We’ll define two cleaning functions: 1. convert_numeric_chars() - converts any character column to numeric if all values are numeric. 2. clean_numeric_columns() - replaces invalid strings (e.g. “na”, “NA”) and converts to numeric if most values are numbers. ## Numeric/factoruian check

convert_numeric_chars <- function(df) {
  df %>%
    mutate(across(where(is.character), ~ {
      if (all(grepl("^[-]?[0-9]*\\.?[0-9]+$", .x[!is.na(.x)]))) {
        as.numeric(.x)
      } else {
        .x
      }
    }))
}

# Apply it
ca_raw <- convert_numeric_chars(ca_raw)

# Function to convert character columns to numeric when appropriate
clean_numeric_columns <- function(df) {
  df <- df %>%
    mutate(across(where(is.character), function(x) {
      # Replace "NA", "na", "" with real NA
      x[x %in% c("NA", "na", "")] <- NA
      
      # Try converting to numeric
      x_num <- suppressWarnings(as.numeric(x))
      
      # If most non-NA values are numeric, keep it
      if (sum(!is.na(x_num)) > 0.9 * sum(!is.na(x))) {
        return(x_num)
      } else {
        return(x)
      }
    }))
  
  return(df)
}

# Apply it to ca_raw
ca_raw <- clean_numeric_columns(ca_raw)

# Check summary of converted classes
table(sapply(ca_raw, class))

## 
## character   numeric 
##        39        55

Explore how database is reduced if we only keep trat A and C

library(dplyr)
library(tidyr)

summary_farmers_raw <- ca_raw %>%
  filter(!is.na(IQR_Season), !is.na(IQR_block)) %>%
  group_by(IQR_Season) %>%
  summarise(
    # total farmers (no yield / treatment conditions)
    total_farmers = n_distinct(IQR_block),

    # farmers who have BOTH A and C with yield not NA and not 0
    farmers_with_A_and_C = n_distinct(IQR_block[
      Treat_code %in% c("A","C") &
        !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 &
        ave(Treat_code %in% c("A","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="A",
            IQR_block, FUN = any) &
        ave(Treat_code %in% c("A","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="C",
            IQR_block, FUN = any)
    ]),

    # farmers who have A AND (B OR C) with yield not NA and not 0
    farmers_with_A_and_BorC = n_distinct(IQR_block[
      Treat_code %in% c("A","B","C") &
        !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 &
        ave(Treat_code %in% c("A","B","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="A",
            IQR_block, FUN = any) &
        ave(Treat_code %in% c("A","B","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code %in% c("B","C"),
            IQR_block, FUN = any)
    ]),
    .groups = "drop"
  ) %>%
  mutate(
    farmers_with_A_and_C = replace_na(farmers_with_A_and_C, 0),
    farmers_with_A_and_BorC = replace_na(farmers_with_A_and_BorC, 0),
    pct_with_A_and_C = round(100 * farmers_with_A_and_C / total_farmers, 1),
    pct_with_A_and_BorC = round(100 * farmers_with_A_and_BorC / total_farmers, 1)
  )

summary_farmers_raw

## # A tibble: 6 × 6
##   IQR_Season total_farmers farmers_with_A_and_C farmers_with_A_and_BorC
##   <chr>              <int>                <int>                   <int>
## 1 23A                  580                  559                     559
## 2 23B                  579                  506                     507
## 3 24A                  495                  469                     471
## 4 24B                  477                  472                     472
## 5 25A                  457                  441                     442
## 6 25B                  449                  441                     442
## # ℹ 2 more variables: pct_with_A_and_C <dbl>, pct_with_A_and_BorC <dbl>

#Data cleaning ## Clean for only farm-season with A and C, and that they are not 0 or NA

library(dplyr)

ca_raw <- ca_raw %>%
  mutate(
    IQF_SeasonblockID = if_else(
      is.na(IQF_SeasonblockID),
      paste(IQR_block, IQR_Season, sep = "_"),
      IQF_SeasonblockID
    )
  )


ca_TATC <- ca_raw %>%
  # keep only A and C rows
  filter(Treat_code %in% c("A", "C")) %>%
  # group by the paired unit
  group_by(IQF_SeasonblockID) %>%
  # keep only blocks where BOTH A and C exist
  filter(all(c("A", "C") %in% Treat_code)) %>%
  # and BOTH have valid (non-NA, non-zero) yields
  filter(all(!is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0)) %>%
  ungroup()

ca_TATC

## # A tibble: 5,776 × 94
##    IQR_Season Seas_block_treat IQR_SeasAB `Sample code`    IQF_agzone IQF_region
##    <chr>      <chr>            <chr>      <chr>            <chr>      <chr>     
##  1 23A        23A79_15C        A          23A_i_P2_79_15_T Mayaga-Bu… S         
##  2 23A        23A79_15A        A          23A_i_P2_79_15_T Mayaga-Bu… S         
##  3 23A        23A58_10A        A          23A_i_P2_58_10_T Central P… NW        
##  4 23A        23A58_10C        A          23A_i_P2_58_10_T Central P… NW        
##  5 23A        23A53_4C         A          23A_i_P2_53_4_T  Volcanic … NW        
##  6 23A        23A53_4A         A          23A_i_P2_53_4_T  Volcanic … NW        
##  7 23A        23A38_15A        A          23A_i_P2_38_15_T Buberuka … NE        
##  8 23A        23A38_15C        A          23A_i_P2_38_15_T Buberuka … NE        
##  9 23A        23A9_14A         A          23A_i_P2_9_14_T  Eastern R… E         
## 10 23A        23A9_14C         A          23A_i_P2_9_14_T  Eastern R… E         
## # ℹ 5,766 more rows
## # ℹ 88 more variables: IQF_District <chr>, IQF_environment <chr>,
## #   IQR_TF_tubura_client <dbl>, ICR_age_hh_head <dbl>, IQR_sex_HH_head <chr>,
## #   IQR_TF_role <chr>, ICR_HH_size <dbl>, ICR_arable_land_owned <dbl>,
## #   IQR_block <chr>, Farm_ID <chr>, IQR_source_income <chr>,
## #   IQR_TF_role_agrdecision <chr>, ICF_Lat <dbl>, ICF_Long <dbl>,
## #   ICF_Alt <dbl>, IQR_plot_fert_quality <chr>, IQF_SeasonblockID <chr>, …

ca_TATC  %>%
  count(IQF_SeasonblockID)

## # A tibble: 2,888 × 2
##    IQF_SeasonblockID     n
##    <chr>             <int>
##  1 102_10_23A            2
##  2 102_10_23B            2
##  3 102_10_24A            2
##  4 102_10_24B            2
##  5 102_10_25A            2
##  6 102_10_25B            2
##  7 102_11_23A            2
##  8 102_11_23B            2
##  9 102_11_24A            2
## 10 102_11_24B            2
## # ℹ 2,878 more rows

Oulier detection

Prepare data

library(lme4)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(lmerTest)

## 
## Attaching package: 'lmerTest'

## The following object is masked from 'package:lme4':
## 
##     lmer

## The following object is masked from 'package:stats':
## 
##     step

library(broom.mixed)

## Warning: package 'broom.mixed' was built under R version 4.3.3

library(dplyr)

model_data <- ca_TATC %>%
  filter(
    !is.na(DC_Yield_t_ha),
    DC_Yield_t_ha != 0,
    !is.na(Treat_code),
    !is.na(IQR_Season),
    !is.na(IQF_environment),
    !is.na(IQR_block)
  ) %>%
  mutate(
    DC_Yield_t_ha   = as.numeric(DC_Yield_t_ha),
    Treat_code      = factor(Treat_code),
    IQR_Season      = factor(IQR_Season),
    IQF_environment = factor(IQF_environment),
    IQR_block       = factor(IQR_block)
  )

###Linear mixed model for outlier detection

model_resid <- lmer(
  DC_Yield_t_ha ~ Treat_code * IQR_Season +
    (1 | IQR_Season) +
    (1 | IQF_environment) +
    (1 | IQR_block),
  data = model_data,
  REML = TRUE
)

Extract residuals

resid_data <- augment(model_resid, data = model_data) %>%
  mutate(
    std_resid = as.numeric(scale(.resid)),
    is_outlier = abs(std_resid) > 3
  )

COunt outliers

outlier_counts <- resid_data %>%
  filter(is_outlier) %>%
  group_by(IQR_Season, Treat_code) %>%
  summarise(
    n_outliers = n(),
    .groups = "drop"
  )

print(outlier_counts)

## # A tibble: 6 × 3
##   IQR_Season Treat_code n_outliers
##   <fct>      <fct>           <int>
## 1 23A        A                  13
## 2 23A        C                   4
## 3 24A        A                  17
## 4 24A        C                   7
## 5 25A        A                   8
## 6 25A        C                   7

###Inspect extreme observations

resid_data %>%
  filter(is_outlier) %>%
  select(
    IQR_Season,
    Treat_code,
    IQR_block,
    IQF_environment,
    DC_Yield_t_ha,
    std_resid
  ) %>%
  arrange(desc(abs(std_resid)))

## # A tibble: 56 × 6
##    IQR_Season Treat_code IQR_block IQF_environment DC_Yield_t_ha std_resid
##    <fct>      <fct>      <fct>     <fct>                   <dbl>     <dbl>
##  1 24A        A          53_4      Highland                 18.1      8.22
##  2 24A        A          53_3      Highland                 19.4      8.14
##  3 24A        A          53_5      Highland                 16.9      7.85
##  4 23A        A          17_6      Mid_Low_Dry              16.2      7.27
##  5 24A        C          53_5      Highland                 14.9      7.19
##  6 24A        C          53_4      Highland                 15.4      7.00
##  7 24A        C          53_3      Highland                 15.7      6.07
##  8 23A        A          17_1      Mid_Low_Dry              16.7      6.07
##  9 24A        A          49_2      Highland                 14.4      5.98
## 10 24A        A          53_1      Highland                 14.1      5.15
## # ℹ 46 more rows

Visualization of ouliers

library(dplyr)
library(ggplot2)

# Prepare base plotting data
plot_data <- ca_TATC %>%
  filter(
    !is.na(DC_Yield_t_ha),
    DC_Yield_t_ha != 0,
    !is.na(Treat_code),
    !is.na(IQR_Season),
    !is.na(IQF_agzone)
  ) %>%
  mutate(
    Treat_code = factor(Treat_code),
    IQR_Season = factor(IQR_Season),
    IQF_agzone = factor(IQF_agzone)
  )

# Join LMM outlier flag
plot_data_lmm <- plot_data %>%
  left_join(
    resid_data %>%
      select(
        IQR_Season,
        Treat_code,
        IQR_block,
        is_outlier_lmm = is_outlier
      ),
    by = c("IQR_Season", "Treat_code", "IQR_block")
  ) %>%
  mutate(
    is_outlier_lmm = ifelse(is.na(is_outlier_lmm), FALSE, is_outlier_lmm)
  )
ggplot(plot_data_lmm, aes(x = Treat_code, y = DC_Yield_t_ha)) +
  geom_boxplot(outlier.shape = NA, fill = "gray90") +
  geom_jitter(
    aes(color = is_outlier_lmm),
    width = 0.2,
    alpha = 0.7
  ) +
  scale_color_manual(
    values = c("FALSE" = "black", "TRUE" = "red"),
    labels = c("FALSE" = "Regular observation", "TRUE" = "Model outlier")
  ) +
  facet_grid(IQF_agzone ~ IQR_Season, scales = "free_y") +
  labs(
    title = "Yield (t/ha) by Treatment and Season",
    subtitle = "Red points indicate outliers detected by linear mixed-effects model (|std. resid| > 4)",
    x = "Treatment Code",
    y = "Yield (t/ha)",
    color = ""
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    strip.text = element_text(size = 9)
  )

Found some plots with yield <0.1 tn/ha explore

library(dplyr)

threshold <- 0.1

block_season_flags <- ca_TATC %>%
  filter(!is.na(IQF_SeasonblockID), !is.na(Treat_code)) %>%
  mutate(DC_Yield_t_ha = as.numeric(DC_Yield_t_ha)) %>%
  group_by(IQF_SeasonblockID, IQR_Season) %>%
  summarise(
    # both A and C exist AND both are < 0.1
    AC_both_lt0.1 =
      any(Treat_code == "A" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold) &
      any(Treat_code == "C" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold),

    # A OR B is < 0.1 (either one, if present)
    AorB_lt0.1 =
      any(Treat_code == "A" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold) |
      any(Treat_code == "C" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold),

    .groups = "drop"
  )

# ---- Overall counts (unique block-season combinations) ----
overall_counts <- block_season_flags %>%
  summarise(
    total_block_season = n(),
    n_AC_lt0.1 = sum(AC_both_lt0.1),
    n_AorB_lt0.1 = sum(AorB_lt0.1),
    prop_AC_lt0.1 = round(mean(AC_both_lt0.1), 3),
    prop_AorB_lt0.1 = round(mean(AorB_lt0.1), 3)
  )

overall_counts

## # A tibble: 1 × 5
##   total_block_season n_AC_lt0.1 n_AorB_lt0.1 prop_AC_lt0.1 prop_AorB_lt0.1
##                <int>      <int>        <int>         <dbl>           <dbl>
## 1               2888          1           14             0           0.005

# ---- By season (optional) ----
by_season_counts <- block_season_flags %>%
  group_by(IQR_Season) %>%
  summarise(
    total_block_season = n(),
    n_AC_lt0.1 = sum(AC_both_lt0.1),
    n_AorB_lt0.1 = sum(AorB_lt0.1),
    prop_AC_lt0.1 = round(mean(AC_both_lt0.1), 3),
    prop_AorB_lt0.1 = round(mean(AorB_lt0.1), 3),
    .groups = "drop"
  )

by_season_counts

## # A tibble: 6 × 6
##   IQR_Season total_block_season n_AC_lt0.1 n_AorB_lt0.1 prop_AC_lt0.1
##   <chr>                   <int>      <int>        <int>         <dbl>
## 1 23A                       559          0            0         0    
## 2 23B                       506          1            7         0.002
## 3 24A                       469          0            1         0    
## 4 24B                       472          0            6         0    
## 5 25A                       441          0            0         0    
## 6 25B                       441          0            0         0    
## # ℹ 1 more variable: prop_AorB_lt0.1 <dbl>

Final clan data = ca_AC_clean

clean databse with no NA or <0.1 for yield and cleaned removing outliers (bothe treatments for theat season-farmer it at lease one is outlier)

library(dplyr)

# --- 1) Identify model outliers (if you already have resid_data, skip to step 2) ---
# NOTE: requires lme4, lmerTest, broom.mixed loaded and model fitted as you did

# resid_data should contain: IQR_Season, Treat_code, IQR_block, IQF_environment, .resid, is_outlier
# (from: resid_data <- augment(model_resid, data = model_data) %>% mutate(std_resid=..., is_outlier=...))

# --- 2) Drop ALL outlier blocks (balanced A/C by SeasonblockID) ---
# Rule: if either A or C is flagged as outlier within a given IQF_SeasonblockID,
#       remove the entire IQF_SeasonblockID (both treatments), so pairs remain balanced.

outlier_ids <- resid_data %>%
  filter(is_outlier) %>%
  distinct(IQF_SeasonblockID)
## not outliers
ca_AC_clean <- ca_TATC %>%
  # keep only A and C (optional, but usually desired for paired A/C analysis)
  filter(Treat_code %in% c("A", "C")) %>%
  # keep only complete A/C pairs with valid yields (recommended)
  group_by(IQF_SeasonblockID) %>%
  filter(all(c("A","C") %in% Treat_code)) %>%
  filter(all(!is.na(DC_Yield_t_ha) & DC_Yield_t_ha >= 0.1)) %>%
  ungroup() %>%
  # drop any SeasonblockID that had an outlier in either treatment
  anti_join(outlier_ids, by = "IQF_SeasonblockID")

# Quick checks
cat("Rows before:", nrow(ca_TATC), "\n")

## Rows before: 5776

cat("Rows after :", nrow(ca_AC_clean), "\n")

## Rows after : 5654

# Confirm balance: should be 2 rows (A and C) per SeasonblockID
balance_check <- ca_AC_clean %>%
  count(IQF_SeasonblockID) %>%
  count(n)

## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.

print(balance_check)

## # A tibble: 1 × 2
##       n    nn
##   <int> <int>
## 1     2  2827

Create variables of crop and year and block_season combination

library(tidyverse)

ca_AC_clean <- ca_AC_clean %>%
  mutate(
    year = str_sub(IQR_Season, 1, 2),
    crop_code = str_sub(IQR_Season, -1),
    crop = case_when(
      crop_code == "A" ~ "Maize",
      crop_code == "B" ~ "Beans",
      TRUE ~ NA_character_
    )
  )

ca_AC_clean$block_season = paste0(ca_AC_clean$IQR_block, "_", ca_AC_clean$IQR_Season)

#New database of column of yield ratio CONV_CA_Ratio

library(dplyr)

CONV_CA_Ratio <- ca_AC_clean %>%
  # Convert profit column
  mutate(
    DC_profit_usd_ha = as.numeric(gsub(",", "", trimws(DC_profit_usd_ha)))
  ) %>%
  
  # Keep only relevant treatments
  filter(Treat_code %in% c("A", "C")) %>%
  filter(!is.na(DC_Yield_t_ha), !is.na(DC_profit_usd_ha)) %>%
  
  # Group by block-season
  group_by(IQR_block, IQR_Season) %>%
  
  # NEW: keep only groups with both A and C after filtering
  filter(n_distinct(Treat_code) == 2) %>%
  
  # Mutate safely
  mutate(
    yield_A = DC_Yield_t_ha[Treat_code == "A"],
    yield_C = DC_Yield_t_ha[Treat_code == "C"],
    profit_A = DC_profit_usd_ha[Treat_code == "A"],
    profit_C = DC_profit_usd_ha[Treat_code == "C"],
    Y_ratio = yield_C / yield_A,
    Profit_ratio = profit_C / profit_A
  ) %>%
  
  # Keep only one row per group (e.g., the C row)
  filter(Treat_code == "C") %>%
  ungroup() %>%
  filter(is.finite(Y_ratio), is.finite(Profit_ratio))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `DC_profit_usd_ha = as.numeric(gsub(",", "",
##   trimws(DC_profit_usd_ha)))`.
## Caused by warning:
## ! NAs introduced by coercion

Yield and profit comparions exploration

% COnv>CA per soason-environment_Yield

library(dplyr)
library(tidyr)

A_vs_C_prop <- ca_AC_clean %>%
  filter(Treat_code %in% c("A", "C")) %>%
  select(
    IQR_Season,
    IQF_environment,
    IQF_SeasonblockID,
    Treat_code,
    DC_Yield_t_ha
  ) %>%
  pivot_wider(
    names_from = Treat_code,
    values_from = DC_Yield_t_ha
  ) %>%
  filter(!is.na(A), !is.na(C)) %>%
  group_by(IQR_Season, IQF_environment) %>%
  summarise(
    n_blocks = n(),
    prop_A_greater_C = mean(A > C),
    prop_C_greater_A = mean(C > A),
    prop_equal = mean(A == C),
    .groups = "drop"
  )

A_vs_C_prop

## # A tibble: 24 × 6
##    IQR_Season IQF_environment n_blocks prop_A_greater_C prop_C_greater_A
##    <chr>      <chr>              <int>            <dbl>            <dbl>
##  1 23A        Highland             120            0.692           0.308 
##  2 23A        Mid_Low_Dry          173            0.728           0.266 
##  3 23A        Mid_Low_Wet          145            0.745           0.255 
##  4 23A        Transition           100            0.68            0.31  
##  5 23B        Highland             111            0.694           0.270 
##  6 23B        Mid_Low_Dry          160            0.906           0.0875
##  7 23B        Mid_Low_Wet          132            0.773           0.212 
##  8 23B        Transition            90            0.756           0.233 
##  9 24A        Highland             104            0.971           0.0288
## 10 24A        Mid_Low_Dry          146            0.753           0.233 
## # ℹ 14 more rows
## # ℹ 1 more variable: prop_equal <dbl>

##Visual distribution of CONV advantage vs CA_Yield

library(ggplot2)

ggplot(CONV_CA_Ratio, aes(x = Y_ratio, fill = IQF_environment)) +
  geom_density(alpha = 0.3) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  labs(title = "Density of Relative Yield (CA / CONV) by Agzone")

##Visual distribution of CONV advantage vs CA_profit

ggplot(CONV_CA_Ratio, aes(x = Profit_ratio, fill = IQF_environment)) +
  geom_density(alpha = 0.5, adjust = 1.2) +  # <-- smoother
   geom_vline(xintercept = 1, linetype = "dashed") +
  scale_fill_brewer(palette = "Pastel1") +
  coord_cartesian(xlim = c(-1, 6)) +
  labs(
    title = "Density of Relative Profit (CA / CONV) by Agzone",
    x = "Profit_ratio",
    y = "Density"
  ) +
  theme_minimal()

## Another way to see the disctribution of the advantage of CONV_Yield Overall about 22% of the block-season combinations had CA as a similar or better performed than CONV

library(dplyr)
library(ggplot2)

# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
  arrange(Y_ratio) %>%
  filter(is.finite(Y_ratio)) %>%
  mutate(
    season_crop = case_when(
      IQR_Season %in% c("23A", "24A", "25A") ~ "Maize season",
      IQR_Season %in% c("23B", "24B", "25B") ~ "Beans season",
      TRUE ~ "Other"
    ),
    cum_percent = 100 * (row_number() / n())
  )

# 2. Estimate intersection y-value at x = 1 (Y_ratio = 1) for each IQF_environment
intersection_lines <- plot_data %>%
  group_by(IQF_environment) %>%
  filter(n() >= 5) %>%
  do({
    tryCatch({
      model <- loess(cum_percent ~ Y_ratio, data = .)
      y_at_1 <- predict(model, newdata = data.frame(Y_ratio = 1))
      data.frame(IQF_environment = unique(.$IQF_environment), y_cross = y_at_1)
    }, error = function(e) {
      data.frame(IQF_environment = unique(.$IQF_environment), y_cross = NA_real_)
    })
  }) %>%
  ungroup()

# 3. Merge y_cross into plot_data
plot_data <- left_join(plot_data, intersection_lines, by = "IQF_environment")

# 4. Plot
ggplot(plot_data, aes(x = Y_ratio, y = cum_percent)) +
  geom_point(aes(color = season_crop), size = 2) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
  geom_hline(aes(yintercept = y_cross), color = "blue", linetype = "dotted") +
  geom_text(
    data = intersection_lines,
    aes(x = 1.01, y = y_cross, label = paste0(round(y_cross, 1), "%")),
    inherit.aes = FALSE,
    color = "blue",
    size = 3,
    hjust = 0
  ) +
  scale_color_manual(
    values = c("Maize season" = "gold", "Beans season" = "saddlebrown", "Other" = "gray70")
  ) +
  scale_x_continuous(limits = c(0, 2.5)) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  facet_wrap(~ IQF_environment, scales = "free_y") +
  labs(
    x = "Relative Yield (CA / CONV)",
    y = "Cumulative % of Observations",
    title = "Cumulative Frequency of Relative Yield at Block Level",
    subtitle = "Dashed = 1 yield threshold; Dotted = intersect with cumulative curve",
    color = "Season Crop Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 10, hjust = 0.5),
    strip.text = element_text(face = "bold"),
    legend.position = "right"
  )

## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).

##Same distribution but for profit IN terms of profit, the advantage of CONV is a little larger

library(dplyr)
library(ggplot2)

# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
  arrange(Profit_ratio) %>%
  filter(is.finite(Profit_ratio)) %>%
  mutate(
    season_crop = case_when(
      IQR_Season %in% c("23A", "24A", "25A") ~ "Maize season",
      IQR_Season %in% c("23B", "24B", "25B") ~ "Beans season",
      TRUE ~ "Other"
    ),
    cum_percent = 100 * (row_number() / n())
  )

# 2. Estimate intersection y-value at x = 1 (Profit_ratio = 1) for each IQF_environment
intersection_lines <- plot_data %>%
  group_by(IQF_environment) %>%
  filter(n() >= 5) %>%
  do({
    tryCatch({
      model <- loess(cum_percent ~ Profit_ratio, data = .)
      y_at_1 <- predict(model, newdata = data.frame(Profit_ratio = 1))
      data.frame(IQF_environment = unique(.$IQF_environment), y_cross = y_at_1)
    }, error = function(e) {
      data.frame(IQF_environment = unique(.$IQF_environment), y_cross = NA_real_)
    })
  }) %>%
  ungroup()

# 3. Merge y_cross into plot_data
plot_data <- left_join(plot_data, intersection_lines, by = "IQF_environment")

# 4. Plot
ggplot(plot_data, aes(x = Profit_ratio, y = cum_percent)) +
  geom_point(aes(color = season_crop), size = 2, alpha = 0.6) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
  geom_hline(aes(yintercept = y_cross), color = "blue", linetype = "dotted") +
  geom_text(
    data = intersection_lines,
    aes(x = 1.01, y = y_cross, label = paste0(round(y_cross, 1), "%")),
    inherit.aes = FALSE,
    color = "blue",
    size = 3,
    hjust = 0
  ) +
  scale_color_manual(
    values = c("Maize season" = "gold", "Beans season" = "saddlebrown", "Other" = "gray70")
  ) +
  scale_x_continuous(limits = c(-2, 2.5)) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  facet_wrap(~ IQF_environment, scales = "free_y") +
  labs(
    x = "Relative Profit (CA / CONV)",
    y = "Cumulative % of Observations",
    title = "Cumulative Frequency of Relative Profit at Block Level",
    subtitle = "Dashed = 1 profit threshold; Dotted = intersect with cumulative curve",
    color = "Season Crop Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 10, hjust = 0.5),
    strip.text = element_text(face = "bold"),
    legend.position = "right"
  )

## Warning: Removed 119 rows containing missing values or values outside the scale range
## (`geom_point()`).

## What is the relation among performance in Yield and Performance in Profit separating by crop?

library(dplyr)
library(ggplot2)

# 1. Prepare filtered data
regression_data <- CONV_CA_Ratio %>%
  filter(
    is.finite(Y_ratio),
    is.finite(Profit_ratio),
    Profit_ratio > -5, Profit_ratio < 5,
    IQR_SeasAB %in% c("A", "B")
  ) %>%
  mutate(
    crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
  )

# 2. Create plot with regression line and 1:1 reference line
ggplot(regression_data, aes(x = Y_ratio, y = Profit_ratio)) +
  geom_point(alpha = 0.4, color = "gray40", size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue", linewidth = 0.8) +
  facet_wrap(~crop_type) +
  labs(
    title = "Relationship Between Yield Ratio and Profit Ratio by Crop",
    x = "Relative Yield (Y_ratio: CA / CONV)",
    y = "Relative Profit (Profit_ratio: CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

## `geom_smooth()` using formula = 'y ~ x'

#How does the performance of CA is affected by the environmental index (yield of the control?) ## First we construct an environment index This is the relative yield of each of the treatments in a block as compared to all the blocks of that treatment in the same season and environment. Theh we average the relative yield of all the treatments in a block for a season and we have the environmental indeal of it expresed as a ratio where if > 1 then the environment as expresed by the crops performance was good Notes: As a whole, the relation is weak, however, yield performance of CA (CA:CONV ratio) start to see more very bad scores for relatively poor environments (EI<1)

library(dplyr)

# Step 1: Make sure relevant columns exist
df <- ca_AC_clean %>%
  filter(!is.na(DC_Yield_t_ha), !is.na(Treat_code), !is.na(block_season), !is.na(IQR_Season), !is.na(IQF_environment))

# Step 2: Compute average yield per Treat_code within IQR_Season + IQF_environment
ref_yields <- df %>%
  group_by(Treat_code, IQR_Season, IQF_environment) %>%
  summarise(avg_env_yield = mean(DC_Yield_t_ha, na.rm = TRUE), .groups = "drop")

# Step 3: Join reference yields back to main data
df_with_ref <- df %>%
  left_join(ref_yields, by = c("Treat_code", "IQR_Season", "IQF_environment")) %>%
  mutate(
    yield_ratio_env = DC_Yield_t_ha / avg_env_yield
  )

# Step 4: Compute average of those ratios per block-season
env_index_df <- df_with_ref %>%
  group_by(block_season) %>%
  summarise(env_index = mean(yield_ratio_env, na.rm = TRUE), .groups = "drop")

# Step 5: View result
head(env_index_df)

## # A tibble: 6 × 2
##   block_season env_index
##   <chr>            <dbl>
## 1 102_10_23A       0.721
## 2 102_10_23B       0.816
## 3 102_10_24A       1.12 
## 4 102_10_24B       0.862
## 5 102_10_25A       1.26 
## 6 102_10_25B       1.18

# Then we paste it into CONV_CA_Ratio
# Merge environment index into CONV_CA_Ratio by block_season
CONV_CA_Ratio <- CONV_CA_Ratio %>%
  left_join(env_index_df, by = "block_season")

# View result
head(CONV_CA_Ratio)

## # A tibble: 6 × 105
##   IQR_Season Seas_block_treat IQR_SeasAB `Sample code`    IQF_agzone  IQF_region
##   <chr>      <chr>            <chr>      <chr>            <chr>       <chr>     
## 1 23A        23A79_15C        A          23A_i_P2_79_15_T Mayaga-Bug… S         
## 2 23A        23A58_10C        A          23A_i_P2_58_10_T Central Pl… NW        
## 3 23A        23A53_4C         A          23A_i_P2_53_4_T  Volcanic c… NW        
## 4 23A        23A38_15C        A          23A_i_P2_38_15_T Buberuka h… NE        
## 5 23A        23A9_14C         A          23A_i_P2_9_14_T  Eastern Ri… E         
## 6 23A        23A32_3C         A          23A_i_P2_32_3_T  Eastern Sa… NE        
## # ℹ 99 more variables: IQF_District <chr>, IQF_environment <chr>,
## #   IQR_TF_tubura_client <dbl>, ICR_age_hh_head <dbl>, IQR_sex_HH_head <chr>,
## #   IQR_TF_role <chr>, ICR_HH_size <dbl>, ICR_arable_land_owned <dbl>,
## #   IQR_block <chr>, Farm_ID <chr>, IQR_source_income <chr>,
## #   IQR_TF_role_agrdecision <chr>, ICF_Lat <dbl>, ICF_Long <dbl>,
## #   ICF_Alt <dbl>, IQR_plot_fert_quality <chr>, IQF_SeasonblockID <chr>,
## #   IQR_soil_texture <chr>, IQR_soil_color <chr>, …

Now we evaluate the relation among EI and evonomic and yield avdantage of CA..for amize and beans separated

library(dplyr)
library(ggplot2)
library(ggpmisc)  # for equation and p-value annotation

## Loading required package: ggpp

## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2

## 
## Attaching package: 'ggpp'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## Registered S3 method overwritten by 'ggpmisc':
##   method                  from   
##   as.character.polynomial polynom

# 1. Prepare data (only Yield Ratio, up to 2.5)
plot_data <- CONV_CA_Ratio %>%
  filter(
    crop %in% c("Maize", "Beans"),
    is.finite(Y_ratio),
    is.finite(env_index),
    Y_ratio <= 2.5
  )

# 2. Plot with regression + equation
ggplot(plot_data, aes(x = env_index, y = Y_ratio)) +
  geom_point(alpha = 0.4, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue") +
  stat_poly_eq(
    aes(label = paste(..eq.label.., ..p.value.label.., sep = "~~~")),
    formula = y ~ x,
    parse = TRUE,
    label.x = "left", label.y = "top",
    size = 3.5
  ) +
  facet_wrap(~ crop) +
  scale_y_continuous(limits = c(0, 2.5)) +
  labs(
    title = "Relationship Between Environment Index and Yield Ratio (CA / CONV)",
    x = "Environment Index",
    y = "Yield Ratio (CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

## Warning: The dot-dot notation (`..eq.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(eq.label)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Segmented linear model

# Load required packages
library(dplyr)
library(ggplot2)
library(segmented)

## Warning: package 'segmented' was built under R version 4.3.3

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: nlme

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:lme4':
## 
##     lmList

## The following object is masked from 'package:dplyr':
## 
##     collapse

library(glue)

## Warning: package 'glue' was built under R version 4.3.3

library(purrr)

# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
  filter(
    crop %in% c("Maize", "Beans"),
    is.finite(Y_ratio),
    is.finite(env_index),
    Y_ratio <= 2.5
  )

# 2. Fit piecewise regression per crop, extract slopes and p-values safely
segmented_results <- plot_data %>%
  group_split(crop) %>%
  map_df(function(df) {
    crop_name <- unique(df$crop)

    # Fit linear model
    lm_fit <- lm(Y_ratio ~ env_index, data = df)

    # Try segmented model
    seg_fit <- tryCatch(
      segmented(lm_fit, seg.Z = ~ env_index, psi = median(df$env_index)),
      error = function(e) NULL
    )

    if (!is.null(seg_fit)) {
      df$fit <- fitted(seg_fit)
      df$breakpoint <- seg_fit$psi[2]

      # Try to extract slopes
      slopes_df <- tryCatch(slope(seg_fit)$env_index, error = function(e) NULL)

      if (!is.null(slopes_df) && nrow(slopes_df) >= 2) {
        slope1 <- slopes_df[1, "Est."]
        slope2 <- slopes_df[2, "Est."]

        # Check if "p" column exists before accessing
        if ("p" %in% colnames(slopes_df)) {
          pval1 <- slopes_df[1, "p"]
          pval2 <- slopes_df[2, "p"]
          df$slope_label <- glue("Slope1 = {round(slope1, 3)} (p = {format.pval(pval1, digits = 2)})\n",
                                 "Slope2 = {round(slope2, 3)} (p = {format.pval(pval2, digits = 2)})")
        } else {
          df$slope_label <- glue("Slope1 = {round(slope1, 3)}\nSlope2 = {round(slope2, 3)}")
        }

      } else {
        df$slope_label <- "Slopes not available"
      }

    } else {
      df$fit <- NA
      df$breakpoint <- NA
      df$slope_label <- "Model failed"
    }

    df
  })

# 3. Plot with segmented line and slope annotations
ggplot(segmented_results, aes(x = env_index, y = Y_ratio)) +
  geom_point(alpha = 0.4, size = 2) +
  geom_line(aes(y = fit), color = "darkred", linewidth = 1) +
  geom_text(
    data = segmented_results %>%
      group_by(crop) %>%
      slice(1),  # only show one label per facet
    aes(x = -Inf, y = Inf, label = slope_label),
    hjust = -0.05, vjust = 1.1,
    inherit.aes = FALSE,
    size = 3.5,
    fontface = "italic"
  ) +
  facet_wrap(~ crop) +
  scale_y_continuous(limits = c(0, 2.5)) +
  labs(
    title = "Piecewise Linear Relationship Between Env Index and Yield Ratio",
    x = "Environment Index",
    y = "Yield Ratio (CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

##Polinomial?

library(dplyr)
library(ggplot2)
library(broom)
library(glue)

# 1. Prepare data
plot_data <- CONV_CA_Ratio %>%
  filter(
    crop %in% c("Maize", "Beans"),
    is.finite(Y_ratio),
    is.finite(env_index),
    Y_ratio <= 2.5
  )

# 2. Compute R² for each model type and crop
r2_data <- plot_data %>%
  group_by(crop) %>%
  group_map(~ {
    df <- .x
    crop_label <- .y$crop

    models <- list(
      Linear = lm(Y_ratio ~ env_index, data = df),
      Quadratic = lm(Y_ratio ~ poly(env_index, 2), data = df),
      Cubic = lm(Y_ratio ~ poly(env_index, 3), data = df),
      LOESS = loess(Y_ratio ~ env_index, data = df)
    )

    r2_vals <- sapply(models, function(m) {
      s <- tryCatch(summary(m), error = function(e) NULL)
      if (!is.null(s$r.squared)) s$r.squared else NA_real_
    })

    tibble(
      crop = crop_label,
      model = names(models),
      r2 = as.numeric(r2_vals)
    )
  }) %>%
  bind_rows() %>%
  mutate(r2_label = glue("{model}: R² = {round(r2, 3)}"))

# 3. Create the base plot
p <- ggplot(plot_data, aes(x = env_index, y = Y_ratio)) +
  geom_point(alpha = 0.4, size = 2) +
  facet_wrap(~ crop) +
  scale_y_continuous(limits = c(0, 2.5)) +
  labs(
    title = "Model Shape Comparison: Yield Ratio vs Environment Index",
    x = "Environment Index",
    y = "Yield Ratio (CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

# 4. Add all smooth lines
p <- p +
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1, linetype = "solid") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red", linewidth = 1, linetype = "dashed") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "darkgreen", linewidth = 1, linetype = "dotdash") +
  geom_smooth(method = "loess", se = FALSE, color = "purple", linewidth = 1, linetype = "twodash")

# 5. Position for R² text
r2_data <- r2_data %>%
  group_by(crop) %>%
  mutate(
    y = 2.5 - 0.2 * (row_number() - 1),
    x = min(plot_data$env_index, na.rm = TRUE) + 0.05
  )

# 6. Add R² labels to plot
p + geom_text(
  data = r2_data,
  aes(x = x, y = y, label = r2_label),
  inherit.aes = FALSE,
  hjust = 0,
  size = 3.5,
  fontface = "italic"
)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

##Now with profit

library(dplyr)
library(ggplot2)
library(broom)
library(glue)

# 1. Prepare data (limit extreme profit ratios)
plot_data <- CONV_CA_Ratio %>%
  filter(
    crop %in% c("Maize", "Beans"),
    is.finite(Profit_ratio),
    is.finite(env_index),
    Profit_ratio <= 5,  # optional: limit outliers
    Profit_ratio >= -5
  )

# 2. Compute R² for each model type and crop
r2_data <- plot_data %>%
  group_by(crop) %>%
  group_map(~ {
    df <- .x
    crop_label <- .y$crop

    models <- list(
      Linear = lm(Profit_ratio ~ env_index, data = df),
      Quadratic = lm(Profit_ratio ~ poly(env_index, 2), data = df),
      Cubic = lm(Profit_ratio ~ poly(env_index, 3), data = df),
      LOESS = loess(Profit_ratio ~ env_index, data = df)
    )

    r2_vals <- sapply(models, function(m) {
      s <- tryCatch(summary(m), error = function(e) NULL)
      if (!is.null(s$r.squared)) s$r.squared else NA_real_
    })

    tibble(
      crop = crop_label,
      model = names(models),
      r2 = as.numeric(r2_vals)
    )
  }) %>%
  bind_rows() %>%
  mutate(r2_label = glue("{model}: R² = {round(r2, 3)}"))

# 3. Create base plot
p <- ggplot(plot_data, aes(x = env_index, y = Profit_ratio)) +
  geom_point(alpha = 0.4, size = 2) +
  facet_wrap(~ crop) +
  scale_y_continuous(limits = c(-5, 5)) +
  labs(
    title = "Model Shape Comparison: Profit Ratio vs Environment Index",
    x = "Environment Index",
    y = "Profit Ratio (CA / CONV)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

# 4. Add all smooth trendlines
p <- p +
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1, linetype = "solid") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red", linewidth = 1, linetype = "dashed") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "darkgreen", linewidth = 1, linetype = "dotdash") +
  geom_smooth(method = "loess", se = FALSE, color = "purple", linewidth = 1, linetype = "twodash")

# 5. Position R² labels
r2_data <- r2_data %>%
  group_by(crop) %>%
  mutate(
    y = 5 - 0.4 * (row_number() - 1),
    x = min(plot_data$env_index, na.rm = TRUE) + 0.05
  )

# 6. Add R² labels
p + geom_text(
  data = r2_data,
  aes(x = x, y = y, label = r2_label),
  inherit.aes = FALSE,
  hjust = 0,
  size = 3.5,
  fontface = "italic"
)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Does the % wins of CA increse progresibely from season 1 to 6? Separated by crop

NOtes: NO, it seems that there is actually not a general trend for yield imperovement (for each crops separately) along the 3 years. However, there is yes plots where there is a positive trend; more for beans than for maize. This may be related to the fact that initially the cover crop did lots of competition with the beans and was corrected a little each season to reduce it..???

library(dplyr)
library(ggplot2)
library(broom)
library(tidyr)

# -----------------------------
# 1. Prepare data
# -----------------------------
trend_data <- CONV_CA_Ratio %>%
  filter(
    is.finite(Y_ratio),
    year %in% c("23", "24", "25"),
    IQR_SeasAB %in% c("A", "B")
  ) %>%
  mutate(
    year = as.numeric(year),
    crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
  )

# -----------------------------
# 2. Function to compute slope counts
# -----------------------------
get_slope_label <- function(df) {
  # One slope per block
  slopes <- df %>%
    group_by(IQR_block) %>%
    filter(n_distinct(year) >= 2) %>%
    summarise(slope = coef(lm(Y_ratio ~ year))[2], .groups = "drop") %>%
    mutate(
      direction = case_when(
        slope > 0 ~ "↑ Positive",
        slope < 0 ~ "↓ Negative",
        TRUE ~ "= Flat"
      )
    ) %>%
    count(direction) %>%
    complete(direction, fill = list(n = 0))

  # Format label text
  paste0(
    "↑ Positive: ", slopes$n[slopes$direction == "↑ Positive"], "\n",
    "↓ Negative: ", slopes$n[slopes$direction == "↓ Negative"], "\n",
    "= Flat: ", slopes$n[slopes$direction == "= Flat"]
  )
}


# Labels
label_maize <- get_slope_label(trend_data %>% filter(crop_type == "Maize"))
label_beans <- get_slope_label(trend_data %>% filter(crop_type == "Beans"))

# Common y-limit helper
y_max <- max(trend_data$Y_ratio, na.rm = TRUE)

# -----------------------------
# 3. MAIZE plot
# -----------------------------
p_maize <- ggplot(
  trend_data %>% filter(crop_type == "Maize"),
  aes(x = year, y = Y_ratio)
) +
  geom_point(alpha = 0.35, size = 2) +
  
  # Block-level linear trends
  geom_smooth(
    aes(group = IQR_block),
    method = "lm",
    se = FALSE,
    color = "steelblue",
    linewidth = 0.6
  ) +
  
  # Global trend
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "red",
    linewidth = 1.3
  ) +
  
  # Annotation
  annotate(
    "label",
    x = 23.05,
    y = y_max * 0.95,
    label = label_maize,
    hjust = 0,
    vjust = 1,
    size = 4.5,
    fontface = "bold",
    fill = "white",
    label.size = 0.4
  ) +
  
  scale_x_continuous(breaks = c(23, 24, 25)) +
  labs(
    title = "Block-level and Overall Linear Trends of Y_ratio (Maize)",
    x = "Year",
    y = "Relative Yield (Y_ratio)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_maize,
## : Ignoring unknown parameters: `label.size`

# -----------------------------
# 4. BEANS plot
# -----------------------------
p_beans <- ggplot(
  trend_data %>% filter(crop_type == "Beans"),
  aes(x = year, y = Y_ratio)
) +
  geom_point(alpha = 0.35, size = 2) +
  
  # Block-level linear trends
  geom_smooth(
    aes(group = IQR_block),
    method = "lm",
    se = FALSE,
    color = "darkorange",
    linewidth = 0.6
  ) +
  
  # Global trend
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "red",
    linewidth = 1.3
  ) +
  
  # Annotation
  annotate(
    "label",
    x = 23.05,
    y = y_max * 0.95,
    label = label_beans,
    hjust = 0,
    vjust = 1,
    size = 4.5,
    fontface = "bold",
    fill = "white",
    label.size = 0.4
  ) +
  
  scale_x_continuous(breaks = c(23, 24, 25)) +
  labs(
    title = "Block-level and Overall Linear Trends of Y_ratio (Beans)",
    x = "Year",
    y = "Relative Yield (Y_ratio)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_beans,
## : Ignoring unknown parameters: `label.size`

# -----------------------------
# 5. Print plots
# -----------------------------
print(p_maize)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

print(p_beans)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## For a farm, the trend for maize is same as for beans? NOte: There is not a clear relation among the trent in yield for maize and beans (that is the trend of yield along seasons is almost independent among crops for the same farm)

library(dplyr)
library(ggplot2)
library(tidyr)

# STEP 1: Calculate slopes per crop per block
slope_data <- CONV_CA_Ratio %>%
  filter(
    is.finite(Y_ratio),
    year %in% c("23", "24", "25"),
    IQR_SeasAB %in% c("A", "B")
  ) %>%
  mutate(
    year = as.numeric(year),
    crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
  ) %>%
  group_by(IQR_block, crop_type) %>%
  filter(n_distinct(year) >= 2) %>%
  summarise(
    slope = coef(lm(Y_ratio ~ year))[2],
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = crop_type,
    values_from = slope
  ) %>%
  filter(!is.na(Maize) & !is.na(Beans))

# STEP 2: Classify quadrants
slope_data <- slope_data %>%
  mutate(
    quadrant = case_when(
      Maize > 0 & Beans > 0 ~ "Q1: ↑ Maize & ↑ Beans",
      Maize < 0 & Beans > 0 ~ "Q2: ↓ Maize & ↑ Beans",
      Maize < 0 & Beans < 0 ~ "Q3: ↓ Maize & ↓ Beans",
      Maize > 0 & Beans < 0 ~ "Q4: ↑ Maize & ↓ Beans",
      TRUE ~ "Other"
    )
  )

# STEP 3: Count blocks per quadrant
quad_counts <- slope_data %>%
  count(quadrant) %>%
  filter(quadrant != "Other")

# STEP 4: Coordinates for text labels per quadrant
label_positions <- tibble::tibble(
  quadrant = c("Q1: ↑ Maize & ↑ Beans", "Q2: ↓ Maize & ↑ Beans",
               "Q3: ↓ Maize & ↓ Beans", "Q4: ↑ Maize & ↓ Beans"),
  x = c( max(slope_data$Maize, na.rm = TRUE) * 0.85,
         min(slope_data$Maize, na.rm = TRUE) * 0.85,
         min(slope_data$Maize, na.rm = TRUE) * 0.85,
         max(slope_data$Maize, na.rm = TRUE) * 0.85),
  y = c( max(slope_data$Beans, na.rm = TRUE) * 0.85,
         max(slope_data$Beans, na.rm = TRUE) * 0.85,
         min(slope_data$Beans, na.rm = TRUE) * 0.85,
         min(slope_data$Beans, na.rm = TRUE) * 0.85)
) %>%
  left_join(quad_counts, by = "quadrant") %>%
  mutate(label = paste0(quadrant, "\nN = ", n))


# STEP 5: Final plot
ggplot(slope_data, aes(x = Maize, y = Beans)) +
  geom_point(alpha = 0.7, size = 2.5, color = "purple") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted") +
  geom_text(data = label_positions,
            aes(x = x, y = y, label = label),
            size = 4.2, hjust = 0.5, vjust = 0.5, fontface = "bold") +
  labs(
    title = "Comparison of Yield Trends (Slopes) per Block",
    subtitle = "Each point is one IQR_block — comparing Maize vs Beans trend slopes",
    x = "Maize trend slope",
    y = "Beans trend slope"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

Mix models to evaluate treatment effect in absolute yield values, the interactions and the destribution of the residuals

Mix model and interactions

Note: Effectes of the 3 factors are significant and also the interactions. So the CA vs. CONV comparison needs to be opened and evaluated by season-environment combination

fit = lmer(DC_Yield_t_ha~IQR_Season + IQF_environment + Treat_code +
             IQR_Season:IQF_environment +  IQF_environment:Treat_code + IQR_Season:Treat_code+
              IQR_Season:IQF_environment:Treat_code+
             (1|IQR_block),
           data = ca_AC_clean)


summary(fit)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## DC_Yield_t_ha ~ IQR_Season + IQF_environment + Treat_code + IQR_Season:IQF_environment +  
##     IQF_environment:Treat_code + IQR_Season:Treat_code + IQR_Season:IQF_environment:Treat_code +  
##     (1 | IQR_block)
##    Data: ca_AC_clean
## 
## REML criterion at convergence: 18145.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1423 -0.5809 -0.0357  0.5168  3.8186 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  IQR_block (Intercept) 0.7352   0.8574  
##  Residual              1.1814   1.0869  
## Number of obs: 5654, groups:  IQR_block, 567
## 
## Fixed effects:
##                                                        Estimate Std. Error
## (Intercept)                                             5.07265    0.11767
## IQR_Season23B                                          -3.89347    0.14310
## IQR_Season24A                                          -0.57716    0.14608
## IQR_Season24B                                          -3.43020    0.14315
## IQR_Season25A                                          -1.31273    0.30002
## IQR_Season25B                                          -3.65663    0.14737
## IQF_environmentMid_Low_Dry                              0.50789    0.14817
## IQF_environmentMid_Low_Wet                             -0.57116    0.15895
## IQF_environmentTransition                              -0.33084    0.17251
## Treat_codeC                                            -0.58904    0.13926
## IQR_Season23B:IQF_environmentMid_Low_Dry                0.31837    0.18685
## IQR_Season24A:IQF_environmentMid_Low_Dry                0.92759    0.19113
## IQR_Season24B:IQF_environmentMid_Low_Dry               -0.41498    0.18889
## IQR_Season25A:IQF_environmentMid_Low_Dry               -0.28488    0.32428
## IQR_Season25B:IQF_environmentMid_Low_Dry                0.15796    0.19279
## IQR_Season23B:IQF_environmentMid_Low_Wet                0.39614    0.19425
## IQR_Season24A:IQF_environmentMid_Low_Wet                0.28734    0.19991
## IQR_Season24B:IQF_environmentMid_Low_Wet               -0.02848    0.19666
## IQR_Season25A:IQF_environmentMid_Low_Wet                1.18206    0.33027
## IQR_Season25B:IQF_environmentMid_Low_Wet                0.26227    0.20164
## IQR_Season23B:IQF_environmentTransition                 0.75971    0.21271
## IQR_Season24A:IQF_environmentTransition                 0.34742    0.21956
## IQR_Season24B:IQF_environmentTransition                 0.67783    0.21652
## IQR_Season25A:IQF_environmentTransition                 1.17576    0.33871
## IQR_Season25B:IQF_environmentTransition                 0.75964    0.22011
## IQF_environmentMid_Low_Dry:Treat_codeC                 -0.12676    0.18153
## IQF_environmentMid_Low_Wet:Treat_codeC                  0.02037    0.18881
## IQF_environmentTransition:Treat_codeC                   0.10068    0.20647
## IQR_Season23B:Treat_codeC                               0.37611    0.20078
## IQR_Season24A:Treat_codeC                              -1.23510    0.20469
## IQR_Season24B:Treat_codeC                              -0.06783    0.20058
## IQR_Season25A:Treat_codeC                              -0.15221    0.40874
## IQR_Season25B:Treat_codeC                               0.29254    0.20575
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC   -0.24892    0.26193
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC    1.01731    0.26749
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC    0.35035    0.26445
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC   -0.44379    0.43993
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC    0.04390    0.26910
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC   -0.07430    0.27266
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC    1.00901    0.28010
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC    0.19780    0.27563
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC   -0.16033    0.44962
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC    0.06295    0.28196
## IQR_Season23B:IQF_environmentTransition:Treat_codeC    -0.19715    0.29854
## IQR_Season24A:IQF_environmentTransition:Treat_codeC     0.75924    0.30740
## IQR_Season24B:IQF_environmentTransition:Treat_codeC     0.04022    0.30248
## IQR_Season25A:IQF_environmentTransition:Treat_codeC    -0.27436    0.46167
## IQR_Season25B:IQF_environmentTransition:Treat_codeC    -0.17918    0.30722
##                                                              df t value
## (Intercept)                                          3946.84230  43.108
## IQR_Season23B                                        5007.77869 -27.209
## IQR_Season24A                                        5014.15153  -3.951
## IQR_Season24B                                        5029.94067 -23.963
## IQR_Season25A                                        5095.20364  -4.375
## IQR_Season25B                                        5036.77759 -24.812
## IQF_environmentMid_Low_Dry                           5265.71529   3.428
## IQF_environmentMid_Low_Wet                           4244.04672  -3.593
## IQF_environmentTransition                            4466.82592  -1.918
## Treat_codeC                                          4934.68676  -4.230
## IQR_Season23B:IQF_environmentMid_Low_Dry             5024.75069   1.704
## IQR_Season24A:IQF_environmentMid_Low_Dry             5028.40349   4.853
## IQR_Season24B:IQF_environmentMid_Low_Dry             5038.70327  -2.197
## IQR_Season25A:IQF_environmentMid_Low_Dry             5152.64948  -0.879
## IQR_Season25B:IQF_environmentMid_Low_Dry             5044.03679   0.819
## IQR_Season23B:IQF_environmentMid_Low_Wet             5004.70035   2.039
## IQR_Season24A:IQF_environmentMid_Low_Wet             5014.82054   1.437
## IQR_Season24B:IQF_environmentMid_Low_Wet             5023.55760  -0.145
## IQR_Season25A:IQF_environmentMid_Low_Wet             5151.71514   3.579
## IQR_Season25B:IQF_environmentMid_Low_Wet             5029.47221   1.301
## IQR_Season23B:IQF_environmentTransition              5024.22535   3.572
## IQR_Season24A:IQF_environmentTransition              5036.14781   1.582
## IQR_Season24B:IQF_environmentTransition              5048.34564   3.131
## IQR_Season25A:IQF_environmentTransition              5140.32873   3.471
## IQR_Season25B:IQF_environmentTransition              5049.16391   3.451
## IQF_environmentMid_Low_Dry:Treat_codeC               4934.73832  -0.698
## IQF_environmentMid_Low_Wet:Treat_codeC               4934.43099   0.108
## IQF_environmentTransition:Treat_codeC                4936.59168   0.488
## IQR_Season23B:Treat_codeC                            4934.54897   1.873
## IQR_Season24A:Treat_codeC                            4933.26541  -6.034
## IQR_Season24B:Treat_codeC                            4934.27215  -0.338
## IQR_Season25A:Treat_codeC                            4932.21882  -0.372
## IQR_Season25B:Treat_codeC                            4933.17413   1.422
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC 4934.36696  -0.950
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 4933.62520   3.803
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 4933.92198   1.325
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC 4932.37895  -1.009
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 4933.18903   0.163
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC 4934.13254  -0.272
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 4933.57546   3.602
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 4934.53452   0.718
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC 4932.40823  -0.357
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 4933.03230   0.223
## IQR_Season23B:IQF_environmentTransition:Treat_codeC  4936.75531  -0.660
## IQR_Season24A:IQF_environmentTransition:Treat_codeC  4934.11571   2.470
## IQR_Season24B:IQF_environmentTransition:Treat_codeC  4935.72560   0.133
## IQR_Season25A:IQF_environmentTransition:Treat_codeC  4932.90235  -0.594
## IQR_Season25B:IQF_environmentTransition:Treat_codeC  4934.01727  -0.583
##                                                      Pr(>|t|)    
## (Intercept)                                           < 2e-16 ***
## IQR_Season23B                                         < 2e-16 ***
## IQR_Season24A                                        7.89e-05 ***
## IQR_Season24B                                         < 2e-16 ***
## IQR_Season25A                                        1.24e-05 ***
## IQR_Season25B                                         < 2e-16 ***
## IQF_environmentMid_Low_Dry                           0.000613 ***
## IQF_environmentMid_Low_Wet                           0.000330 ***
## IQF_environmentTransition                            0.055199 .  
## Treat_codeC                                          2.38e-05 ***
## IQR_Season23B:IQF_environmentMid_Low_Dry             0.088461 .  
## IQR_Season24A:IQF_environmentMid_Low_Dry             1.25e-06 ***
## IQR_Season24B:IQF_environmentMid_Low_Dry             0.028071 *  
## IQR_Season25A:IQF_environmentMid_Low_Dry             0.379705    
## IQR_Season25B:IQF_environmentMid_Low_Dry             0.412614    
## IQR_Season23B:IQF_environmentMid_Low_Wet             0.041471 *  
## IQR_Season24A:IQF_environmentMid_Low_Wet             0.150693    
## IQR_Season24B:IQF_environmentMid_Low_Wet             0.884848    
## IQR_Season25A:IQF_environmentMid_Low_Wet             0.000348 ***
## IQR_Season25B:IQF_environmentMid_Low_Wet             0.193442    
## IQR_Season23B:IQF_environmentTransition              0.000358 ***
## IQR_Season24A:IQF_environmentTransition              0.113631    
## IQR_Season24B:IQF_environmentTransition              0.001754 ** 
## IQR_Season25A:IQF_environmentTransition              0.000522 ***
## IQR_Season25B:IQF_environmentTransition              0.000563 ***
## IQF_environmentMid_Low_Dry:Treat_codeC               0.485007    
## IQF_environmentMid_Low_Wet:Treat_codeC               0.914091    
## IQF_environmentTransition:Treat_codeC                0.625834    
## IQR_Season23B:Treat_codeC                            0.061097 .  
## IQR_Season24A:Treat_codeC                            1.72e-09 ***
## IQR_Season24B:Treat_codeC                            0.735259    
## IQR_Season25A:Treat_codeC                            0.709621    
## IQR_Season25B:Treat_codeC                            0.155149    
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC 0.341986    
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 0.000145 ***
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 0.185282    
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC 0.313132    
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 0.870420    
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC 0.785259    
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 0.000319 ***
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 0.473017    
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC 0.721415    
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 0.823344    
## IQR_Season23B:IQF_environmentTransition:Treat_codeC  0.509028    
## IQR_Season24A:IQF_environmentTransition:Treat_codeC  0.013548 *  
## IQR_Season24B:IQF_environmentTransition:Treat_codeC  0.894235    
## IQR_Season25A:IQF_environmentTransition:Treat_codeC  0.552351    
## IQR_Season25B:IQF_environmentTransition:Treat_codeC  0.559773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## Correlation matrix not shown by default, as p = 48 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

anova(fit)

## Type III Analysis of Variance Table with Satterthwaite's method
##                                        Sum Sq Mean Sq NumDF  DenDF   F value
## IQR_Season                            10906.4 2181.29     5 5090.1 1846.3812
## IQF_environment                         144.2   48.06     3 1584.7   40.6770
## Treat_code                              456.2  456.21     1 4932.1  386.1684
## IQR_Season:IQF_environment              572.7   38.18    15 5134.1   32.3174
## IQF_environment:Treat_code               10.0    3.32     3 4935.8    2.8144
## IQR_Season:Treat_code                   106.8   21.36     5 4932.0   18.0778
## IQR_Season:IQF_environment:Treat_code    42.0    2.80    15 4933.6    2.3681
##                                          Pr(>F)    
## IQR_Season                            < 2.2e-16 ***
## IQF_environment                       < 2.2e-16 ***
## Treat_code                            < 2.2e-16 ***
## IQR_Season:IQF_environment            < 2.2e-16 ***
## IQF_environment:Treat_code             0.037797 *  
## IQR_Season:Treat_code                 < 2.2e-16 ***
## IQR_Season:IQF_environment:Treat_code  0.002128 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Visual of CA:CONV be season-environemnt convination

# Load required libraries
library(dplyr)
library(ggplot2)

# 1. Filter and prepare main data
plot_data <- CONV_CA_Ratio %>%
  filter(is.finite(Y_ratio), Y_ratio <= 5)

# 2. Compute p-values, significance, and means
sig_labels <- plot_data %>%
  group_by(IQR_Season, IQF_environment, crop) %>%
  summarise(
    mean_y = mean(Y_ratio, na.rm = TRUE),
    p_value = tryCatch(t.test(Y_ratio, mu = 1)$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  mutate(
    signif = case_when(
      is.na(p_value) ~ "",
      p_value < 0.01 ~ "**",
      p_value < 0.05 ~ "*",
      TRUE ~ ""
    ),
    mean_label = round(mean_y, 2)  # Rounded mean
  )

# 3. Get max Y for each group to place stars
y_pos <- plot_data %>%
  group_by(IQR_Season, IQF_environment, crop) %>%
  summarise(y_max = max(Y_ratio, na.rm = TRUE), .groups = "drop")

# 4. Merge label + position
sig_labels <- left_join(sig_labels, y_pos, by = c("IQR_Season", "IQF_environment", "crop"))

# 5. Plot
ggplot(plot_data, aes(x = IQR_Season, y = Y_ratio, color = crop)) +
  geom_boxplot(outlier.alpha = 0.3) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray40") +
  facet_wrap(~ IQF_environment) +

  # Add significance stars
  geom_text(
    data = sig_labels,
    aes(x = IQR_Season, y = y_max + 0.15, label = signif, group = crop),
    inherit.aes = FALSE,
    size = 5,
    fontface = "bold"
  ) +

  # Add mean values just above the stars
  geom_text(
    data = sig_labels,
    aes(x = IQR_Season, y = y_max + 0.7, label = mean_label, group = crop),
    inherit.aes = FALSE,
    size = 3.5,
    fontface = "plain",
    color = "black"
  ) +

  labs(
    title = "Yield Ratio (CA / CONV) by Season and Environment",
    x = "Season",
    y = "Yield Ratio"
  ) +
  ylim(0, 5.7) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    strip.text = element_text(face = "bold")
  )

### Distribution of the residuals from mix model Although Shapiro–Wilk tests indicated statistically significant deviations from normality (p < 0.001), that my be because of the large sample size; W=0.99 the residuals are close to normality and Q–Q plots showed no substantial departures from normality, indicating that model assumptions were adequately met.

library(broom.mixed)
library(dplyr)

resid_df <- augment(fit) %>%
  mutate(
    resid = .resid,
    fitted = .fitted
  )
library(ggplot2)

ggplot(resid_df, aes(x = resid)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 40,
                 fill = "grey80",
                 color = "black") +
  geom_density(color = "blue", linewidth = 1) +
  labs(
    title = "Distribution of residuals",
    x = "Residuals",
    y = "Density"
  ) +
  theme_minimal()

ggplot(resid_df, aes(sample = resid)) +
  stat_qq(color = "black") +
  stat_qq_line(color = "red", linewidth = 1) +
  labs(
    title = "Q–Q plot of residuals"
  ) +
  theme_minimal()

ggplot(resid_df, aes(x = fitted, y = resid)) +
  geom_point(alpha = 0.4) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs fitted values",
    x = "Fitted values",
    y = "Residuals"
  ) +
  theme_minimal()

### Test of residuals H0 rejected but because we have large N, the W is actually very close to 1…so its quite normal

set.seed(123)
shapiro.test(sample(resid_df$resid, 5000))

## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid_df$resid, 5000)
## W = 0.98611, p-value < 2.2e-16

Random effects normality

Mean is 0 and symetric distribution of the residual of the random effects

qqnorm(ranef(fit)$IQR_block[[1]])
qqline(ranef(fit)$IQR_block[[1]], col = "red")

Typology of Yield Performance

Typology of multi-season CA performance (23A–25B)

To describe how farms perform under CA over time, we created a typology of block-level responses using their relative yield (CA / CONV) across six seasons (23A–25B). Only blocks with data in at least four seasons were retained to ensure stable classification.

For each block, we calculated its average relative yield, identified whether it ever experienced a yield penalty greater than 5%, and compiled its full sequence of seasonal performances. Based on these indicators, each farm was grouped into one of four long-term performance types: -Good: no season < 0.95 -Acceptable: Average ≥ 0.95 but at least one season < 0.95 -Poor: Average between 0.85 and 0.95 -Very Poor: Average ≤ 0.85

These thresholds were defined arbitrarily but intentionally, informed by field experience and our understanding of what yield reductions farmers may find acceptable if longer-term soil-health and economic benefits of CA are demonstrated and communicated clearly. Importantly, this typology is not final—it will be refined and validated through farmer consultations to ensure the categories and cut-off points align with farmers’ own perceptions of acceptable and unacceptable performance.

Notes: Average performance of CA is very bad, even if we target the environmetns or the districts with best performance, this blanket reccomednation would fail baddly. However,if we would be able to target the plots with good performance.. the rate of success of the reccomendatio could be high…for aprox 20% of the plots (i.e., 35% of the farmers…)

library(dplyr)
library(tidyr)

# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")

# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>%
  group_by(IQR_block) %>%
  summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
  filter(n_valid_seasons >= 4)

# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- CONV_CA_Ratio %>%
  filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
         IQR_Season %in% seasons,
         is.finite(Y_ratio))

# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
  group_by(IQR_block, IQR_Season) %>%
  summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = IQR_Season,
    values_from = Y_ratio,
    names_prefix = "Y_ratio_"
  )

# Step 5: Compute performance typology
Perf_Type <- filtered_yield %>%
  group_by(IQR_block) %>%
  summarise(
    n_seasons = n_distinct(IQR_Season),
    any_below_0.95 = any(Y_ratio < 0.95),
    mean_ratio = mean(Y_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    CA_typology = case_when(
      !any_below_0.95 ~ "Good",
      any_below_0.95 & mean_ratio >= 0.95 ~ "Acceptable",
      any_below_0.95 & mean_ratio < 0.95 & mean_ratio > 0.85 ~ "Poor",
      mean_ratio <= 0.85 ~ "Very Poor"
    )
  )

# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- CONV_CA_Ratio %>%
  filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
  arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>%  # Prioritize 23A > 24A > 25A
  distinct(IQR_block, .keep_all = TRUE) %>%
  dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)


# Step 7: Join all into Perf_Type
Perf_Type <- Perf_Type %>%
  left_join(RY_wide, by = "IQR_block") %>%
  left_join(block_meta, by = "IQR_block")

Now we analyse the frequency of each type

Ahore por environment

# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
                                levels = c("Very Poor", "Poor", "Acceptable", "Good"))

ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
  geom_bar(
    aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
    position = "stack"
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "CA-Performance Typology Distribution by OAF-AEZ",
    x = "Performance Typology",
    y = "Percentage of Performance Typologies"
  ) +
  scale_fill_manual(values = c(
    "Very Poor" = "#e41a1c",
    "Poor" = "#ff7f00",
    "Acceptable" = "#377eb8",
    "Good" = "#4daf4a"
  )) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  )

## What if we target the districts with the best performnace? wht could be the rate of success? Notes: Yes better chance but it would still fail in all

# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
                                levels = c("Very Poor", "Poor", "Acceptable", "Good"))

ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
  geom_bar(
    aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
    position = "stack"
  ) +
  facet_wrap(~ IQF_District) +
  labs(
    title = "CA-Performance Typology Distribution by District",
    x = "Performance Typology",
    y = "Percentage of Performance Typologies"
  ) +
  scale_fill_manual(values = c(
    "Very Poor" = "#e41a1c",
    "Poor" = "#ff7f00",
    "Acceptable" = "#377eb8",
    "Good" = "#4daf4a"
  )) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold")
  )

## Another way to see it

library(dplyr)
library(ggplot2)
library(glue)

# 1. Mean Y_ratio per block (across seasons)
block_means <- CONV_CA_Ratio %>%
  filter(is.finite(Y_ratio)) %>%
  group_by(IQR_block, IQF_District) %>%
  summarise(mean_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")

# 2. Summary for annotations
district_summary <- block_means %>%
  group_by(IQF_District) %>%
  summarise(
    avg_ratio = round(mean(mean_Y_ratio, na.rm = TRUE), 2),
    n_total = n(),
    n_ge1 = sum(mean_Y_ratio >= 1, na.rm = TRUE),
    percent_ge1 = round(100 * n_ge1 / n_total)
  ) %>%
  mutate(
    label = glue("≥1: {percent_ge1}%\nµ = {avg_ratio}"),
    y_pos = 1.5
  )

# 3. Plot
ggplot(block_means, aes(x = IQF_District, y = mean_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "gray90") +
  geom_jitter(width = 0.2, alpha = 0.5, color = "steelblue", size = 2) +
  geom_text(
    data = district_summary,
    aes(x = IQF_District, y = y_pos, label = label),
    inherit.aes = FALSE,
    size = 3.5,
    vjust = 0
  ) +
  labs(
    title = "Distribution of Mean Y_ratio per Block by District",
    x = "District",
    y = "Mean Y_ratio per Block (across seasons)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

# Drivers of yield performance We start with the database of yield drivers that was already processed in the scrips: “data_mm_block” but we change the relative

data_mm_block <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/data_mm_block.csv")

## Rows: 473 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): IQR_block, IQR_TF_tubura_client, IQR_plot_fert_quality, IQR_soil_te...
## dbl (8): ICR_Org_C_avg, ICR_K_perc_23A, ICR_Avail_P_avg, ICR_arable_land_avg...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

drivers_data <- data_mm_block %>%
  mutate(
    CA_typology = fct_drop(CA_typology),

    IQR_block = factor(IQR_block),
    IQR_TF_tubura_client  = factor(IQR_TF_tubura_client),
    IQR_plot_fert_quality = factor(IQR_plot_fert_quality),
    IQR_soil_texture      = fct_na_value_to_level(factor(IQR_soil_texture), "Unknown"),
    IQR_soil_color        = fct_na_value_to_level(factor(IQR_soil_color), "Unknown"),
    IQF_position_slope    = fct_na_value_to_level(factor(IQF_position_slope), "Unknown"),
    Weed_species_A_combined = fct_na_value_to_level(factor(Weed_species_A_combined), "Unknown")
  )

str(drivers_data)

## tibble [473 × 17] (S3: tbl_df/tbl/data.frame)
##  $ IQR_block              : Factor w/ 473 levels "10_1","10_10",..: 15 16 17 18 19 20 21 22 23 24 ...
##  $ ICR_Org_C_avg          : num [1:473] 2.01 1.75 2.16 1.68 1.99 ...
##  $ ICR_K_perc_23A         : num [1:473] 0.75 0.32 0.57 0.41 0.42 0.55 1.75 1.39 2.09 1.25 ...
##  $ ICR_Avail_P_avg        : num [1:473] 11.7 11.5 22 60.6 35.6 ...
##  $ ICR_arable_land_avg    : num [1:473] 40.4 56.4 45.8 54 28.2 54.6 51.8 47.8 60.2 38.2 ...
##  $ ICF_Alt_avg            : num [1:473] 1636 1293 1141 1207 1145 ...
##  $ ICR_rainfall_avg       : num [1:473] 3.96 3.97 3.97 3.97 3.97 ...
##  $ Comp_amount_corr_quali : num [1:473] 5.17 4.39 4.42 4.73 4.9 ...
##  $ ICF_planting_date      : num [1:473] 8.2 9.4 6.75 12.4 5.2 2.8 6.4 4.4 5.2 6.8 ...
##  $ IQR_TF_tubura_client   : Factor w/ 3 levels "---","0","1": 3 3 2 3 2 3 3 3 3 3 ...
##  $ IQR_plot_fert_quality  : Factor w/ 4 levels "---","high","low_quality",..: 4 2 4 2 4 4 4 2 4 2 ...
##  $ IQR_soil_texture       : Factor w/ 4 levels "Coarse","Fine",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ IQR_soil_color         : Factor w/ 6 levels "#N/A","black",..: 2 2 2 5 2 5 2 2 5 2 ...
##  $ IQF_position_slope     : Factor w/ 7 levels "---","Backslope",..: 4 2 2 2 4 2 6 2 2 2 ...
##  $ Weed_species_A_combined: Factor w/ 5 levels "—","Broadleaf",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ CA_typology            : Factor w/ 3 levels "Acceptable","Poor",..: 3 1 3 2 1 3 1 1 2 2 ...
##  $ IQF_environment        : chr [1:473] "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" ...

Now we paste the averaged relative yield along the 6 seasons to each bloc and we remove the of CA_typology

# 1. Remove 'CA_typology' column from drivers_data
drivers_data <- drivers_data |> dplyr::select(-CA_typology)

# 2. Calculate average Y_ratio by IQR_block from CONV_CA_Ratio
# First, ensure grouping by both IQR_block and IQR_season, then average per IQR_block
avg_Y_ratio <- CONV_CA_Ratio |>
  dplyr::group_by(IQR_block, IQR_Season) |>
  dplyr::summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") |>
  dplyr::group_by(IQR_block) |>
  dplyr::summarise(avg_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")

# 3. Merge avg_Y_ratio into drivers_data by IQR_block
drivers_data <- drivers_data |> 
  dplyr::left_join(avg_Y_ratio, by = "IQR_block")

We also add the performance variable CA_yupology”

library(dplyr)

# Merge CA_typology from Perf_Type using IQR_block as the key
drivers_data <- drivers_data |>
  left_join(Perf_Type |>
              dplyr::select(IQR_block, CA_typology),
            by = "IQR_block")

#Explore, visually the protential drivers and the relatio nwith Y_ration of the blocks ## Cathegoric variables Variables we have so far (after filtering some out and re-grouping cathegories with low N: ICR_Org_C_avg num: ICR_K_perc_23A num: ICR_Avail_P_avg num: ICR_arable_land_avg num: ICF_Alt_avg num: ICR_rainfall_avg num: Comp_amount_corr_quali num: ICF_planting_date num: IQR_TF_tubura_client Factor: IQR_plot_fert_quality Factor: IQR_soil_texture Factor w/ 4 levels: IQR_soil_color Factor w/ 6 levels: IQF_position_slope Factor w/ 7 levels: Weed_species_A_combined: Factor w/ 5 levels: IQF_environment chr “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” Output variables: avg_Y_ratio num: CA_typology factor:

CA Performance as affected by ICR_Org_C_avg

with avg_Y_ratio

Notes: when we look at all the database pooled together, there is no relation at all among SOC and performance of the CA. Howeever, where we split the analysis be Environment, then an interation becomes clear, where there is no relation in mid altitude and the relation is quite significant and positive in Highland and sligtly negative in transition. Conclusion. SOC should remain for the drivers analysis

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_Org_C_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_Org_C_avg vs. avg_Y_ratio",
    x = "Organic Carbon (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

### What if we facet by environment? to see if interactions are masking the effect

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions (adjust if needed)
#----------------------------------------------
# Choose fixed position for annotation
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, lm line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_Org_C_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Organic Carbon (%)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

with CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Fit ANOVA model
model_anova <- aov(ICR_Org_C_avg ~ CA_typology, data = drivers_data)

# 2. Tukey HSD post-hoc test
tukey <- TukeyHSD(model_anova)

# 3. Extract group letters
letters_df <- multcompLetters(tukey$CA_typology[,"p adj"])$Letters
letters_df <- data.frame(CA_typology = names(letters_df), Letters = letters_df)

# 4. Calculate group means
means_df <- drivers_data %>%
  group_by(CA_typology) %>%
  summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 5. Combine letters and means
plot_labels <- left_join(letters_df, means_df, by = "CA_typology")

# 6. Create boxplot with letters and means at fixed Y positions
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray", color = "black") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_text(data = plot_labels, aes(x = CA_typology, y = 7.5, label = Letters),
            fontface = "bold", size = 5, color = "darkred") +
  geom_text(data = plot_labels, aes(x = CA_typology, y = 7, label = mean_label),
            fontface = "plain", size = 4, color = "black") +
  labs(
    title = "ICR_Org_C_avg by CA_typology",
    subtitle = "Letters indicate significant differences (Tukey HSD)",
    x = "CA Typology",
    y = "Organic Carbon (%)"
  ) +
  theme_minimal()

### What if we facet by Environment? myve some interactions masking the effect?

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------

get_letters <- function(df) {

  mod <- aov(ICR_Org_C_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)

  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------

stats_df <- drivers_data %>%
  filter(!is.na(CA_typology)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.

#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------

ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +

  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

# Letters
geom_text(
  data = stats_df,
  aes(x = CA_typology, y = 9.5, label = Letters),
  inherit.aes = FALSE,
  fontface = "bold",
  size = 3,
  color = "darkred"
) +

# Means
geom_text(
  data = stats_df,
  aes(x = CA_typology, y = 8.25, label = mean_label),
  inherit.aes = FALSE,
  size = 3
) +


  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_Org_C_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Organic Carbon (%)"
  ) +

  theme_minimal()

## CA Performance as affected by ICR_K_perc_23A ### with avg_Y_ratio Notes: there is some minimal relation in highland. When we look at the correlation among SOC and K, its relatively high in highland. So we could just use SOC in the drivers analysis and exclude K Conclusion: Keep SCO but explcude K

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_K_perc_23A), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_K_perc_23A, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_K_perc_23A, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_K_perc_23A vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Potassium (%) in 23A",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

### with CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
  mod <- aov(ICR_K_perc_23A ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_K_perc_23A)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_K_perc_23A, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.

#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_K_perc_23A)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  # Letters
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 3.2, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  # Means
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 2.8, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_K_perc_23A by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Potassium (%) in 23A"
  ) +

  theme_minimal()

## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Since SOC semms to have some explanatory power and K very little, lets see the correlation among them Notes: there is quite a correlation among K and SOC in highland

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_K_perc_23A), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_K_perc_23A ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3), 
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_K_perc_23A)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Potassium (23A)",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_K_perc_23A (Potassium %)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

## CA Performance as affected by ICR_Avail_P_avg ### with avg_Y_ratio NOtes: P seems to have some explanatory power in 2 of the 4 environemnts and the correlation with SOC is not strong. Conclusions: keep it for the drivers analysis

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_Avail_P_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_Avail_P_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Avail_P_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_Avail_P_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Available Phosphorus (ppm)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

###ICR_Avail_P_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
  mod <- aov(ICR_Avail_P_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_Avail_P_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_Avail_P_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.

#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Avail_P_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  # Letters
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 110, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  # Means
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 100, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_Avail_P_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Available Phosphorus (ppm)"
  ) +
  theme_minimal()

###Correlation: ICR_Org_C_avg vs. ICR_Avail_P_avg

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_Avail_P_avg), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_Avail_P_avg ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3), 
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_Avail_P_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Available Phosphorus",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_Avail_P_avg (Phosphorus ppm)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## CA Performance as affected by ICR_arable_land_avg

NOtes: no correlation when pooled together but a trant in 2 out of 4 environments. Moreover it is a hosehold variable that is ususally used to explain perfomrance of practices Conclusions: Keep it for drivers analysis

with avg_Y_ratio

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_arable_land_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_arable_land_avg vs. avg_Y_ratio",
    x = "Arable Land (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

### In facet by environment

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_arable_land_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_arable_land_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_arable_land_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Arable land (%)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

###ICR_arable_land_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

get_letters <- function(df) {
  mod <- aov(ICR_arable_land_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_arable_land_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_arable_land_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 1)))

## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.

ggplot(drivers_data, aes(x = CA_typology, y = ICR_arable_land_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 90, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 82, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_arable_land_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Arable land (%)"
  ) +
  theme_minimal()

###Correlation: ICR_Org_C_avg vs ICR_arable_land_avg

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_arable_land_avg), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_arable_land_avg ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

label_y <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_arable_land_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Arable Land",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_arable_land_avg (Arable land %)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## CA Performance as affected by ICF_Alt_avg

NOtes: there seems to be no correlation when pooled together and even less when split by environment. Environment clasiffication is mainly based on altitude so if we are using em=nvironments as driver, altitude can be eliminated Conclusions: Do not use further in drivers analysis

###All together

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_Alt_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICF_Alt_avg vs. avg_Y_ratio",
    x = "Altitude (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

with avg_Y_ratio

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICF_Alt_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICF_Alt_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICF_Alt_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_Alt_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values per environment",
    x = "Altitude (m)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

###ICF_Alt_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

# Function for letters
get_letters <- function(df) {
  mod <- aov(ICF_Alt_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

# Run stats by environment
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICF_Alt_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICF_Alt_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 0)))

## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.

# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICF_Alt_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 2000, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 1800, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_Alt_avg by CA_typology (per environment)",
    subtitle = "Tukey HSD: significant differences by letter",
    x = "CA Typology",
    y = "Altitude (m)"
  ) +
  theme_minimal()

CA Performance as affected by ICR_rainfall_avg

NOtes: Rain seems to partially explain CA performance, with a cuadratice-model response when pooled all environmetns together, with 4 mm per day as optimumn. However, when split by environments, linear models can fit as good Conclusions: Keep, consider cuadratic relation and not linear if not grouped within environments ###Simple regression: ICR_rainfall_avg vs avg_Y_ratio

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg, data = drivers_data)

# 2. Extract p-value and R-squared from model summary
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5, color = "black") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

### What wbout tieh a cuadratic model?

library(ggplot2)

# 1. Fit quadratic model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2),
            data = drivers_data)

# 2. Extract p-value and R²
p_val <- summary(model)$coefficients["I(ICR_rainfall_avg^2)", 4]
r2 <- summary(model)$r.squared

# 3. Create plot with p and R²
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2),
    se = TRUE,
    color = "darkred",
    linetype = "dashed"
  ) +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
annotate("text", 
         x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
         label = paste0("p = ", signif(p_val, 3),
                        "\nR^2 = ", round(r2, 2)),
         size = 5)

###Faceted regression: ICR_rainfall_avg vs avg_Y_ratio by environment

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

# 1. Fit separate linear models per environment
stats_df <- drivers_data %>%
  filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

# 2. Annotation positions
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_rainfall_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values per environment",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

### same as avobe but with cuadratic model

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

# 1. Fit quadratic models per environment
stats_df <- drivers_data %>%
  filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2), data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients["I(ICR_rainfall_avg^2)", 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),

    # IMPORTANT: use R^2 (not R²)
    stat_label = paste0(
      "p = ", signif(p_value, 3),
      "\nR^2 = ", round(r_squared, 2)
    )
  ) %>%
  dplyr::select(IQF_environment, stat_label)


stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2),
    se = TRUE,
    color = "darkred",
    linetype = "dashed"
  ) +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
labs(
  title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
  subtitle = "Quadratic regressions per environment with p-values and R^2",
  x = "Rainfall (avg, mm)",
  y = "Average Yield Ratio"
)

###Boxplot: ICR_rainfall_avg by CA_typology (with Tukey HSD letters)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

# Function to extract Tukey letters
get_letters <- function(df) {
  mod <- aov(ICR_rainfall_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

# Run stats by environment
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_rainfall_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_rainfall_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.

# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICR_rainfall_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 7, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 6.2, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_rainfall_avg by CA_typology (per environment)",
    subtitle = "Tukey HSD: significant differences by letter",
    x = "CA Typology",
    y = "Rainfall (avg, mm)"
  ) +
  theme_minimal()

## CA Performance as affected by Comp_amount_corr_quali

NOtes: It seems to have a significant relation with both Y_relative and typology. Wether pooled all together of per environment, but when split by environment, all show the sam esign of trend, but is clear and significant only for mid-dry and transition Conclusions: Keep for the drivers analysis ###Simple regression: Comp_amount_corr_quali vs avg_Y_ratio

library(ggplot2)

# Filter data
df_filt <- drivers_data %>%
  dplyr::filter(!is.na(Comp_amount_corr_quali),
                !is.na(avg_Y_ratio),
                Comp_amount_corr_quali <= 15)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = df_filt)

# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "Comp_amount_corr_quali vs. avg_Y_ratio",
    x = "Compost amount (corrected, ≤15)",
    y = "Average Yield Ratio"
  ) +
  annotate("text",
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

###Linear regression per environment (facet + R² + p, compost ≤ 15)

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(Comp_amount_corr_quali),
                !is.na(avg_Y_ratio),
                !is.na(IQF_environment),
                Comp_amount_corr_quali <= 15)

#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Comp_amount_corr_quali vs. avg_Y_ratio",
    subtitle = "Linear regressions per environment (compost ≤ 15)",
    x = "Compost amount (corrected)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

###Boxplot by CA_typology (Tukey HSD, compost ≤ 15)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(CA_typology),
                !is.na(Comp_amount_corr_quali),
                !is.na(IQF_environment),
                Comp_amount_corr_quali <= 15)

get_letters <- function(df) {
  mod <- aov(Comp_amount_corr_quali ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(Comp_amount_corr_quali, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.

ggplot(df_filt, aes(x = CA_typology, y = Comp_amount_corr_quali)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95,
        label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.85,
        label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "Comp_amount_corr_quali by CA_typology (per environment)",
    subtitle = "Tukey HSD (compost ≤ 15)",
    x = "CA Typology",
    y = "Compost amount (corrected)"
  ) +
  theme_minimal()

## CA Performance as affected by ICF_planting_date

NOtes: Conclusions: ###Simple regression: ICR_rainfall_avg ICF_planting_date

library(ggplot2)
library(dplyr)

# Filter data
df_filt <- drivers_data %>%
  dplyr::filter(!is.na(ICF_planting_date),
                !is.na(avg_Y_ratio))

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_planting_date, data = df_filt)

# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICF_planting_date vs. avg_Y_ratio",
    x = "Planting date",
    y = "Average Yield Ratio"
  ) +
  annotate("text",
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

###Linear regression per environment (facet + R^2 + p)

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(ICF_planting_date),
                !is.na(avg_Y_ratio),
                !is.na(IQF_environment))

#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICF_planting_date, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date vs. avg_Y_ratio",
    subtitle = "Linear regressions per environment",
    x = "Planting date",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

###Boxplot by CA_typology (Tukey HSD)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(CA_typology),
                !is.na(ICF_planting_date),
                !is.na(IQF_environment))

get_letters <- function(df) {
  mod <- aov(ICF_planting_date ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICF_planting_date, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 1)))

## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.

ggplot(df_filt, aes(x = CA_typology, y = ICF_planting_date)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95,
        label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.85,
        label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date by CA_typology (per environment)",
    subtitle = "Tukey HSD",
    x = "CA Typology",
    y = "Planting date"
  ) +
  theme_minimal()

###How does delay in planting relate to rain over the season?

library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)

# 1. Filter valid data
df_filt <- drivers_data %>%
  dplyr::filter(
    !is.na(ICF_planting_date),
    !is.na(ICR_rainfall_avg),
    !is.na(IQF_environment)
  )

# 2. Fit linear model per environment
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_rainfall_avg ~ ICF_planting_date, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

# 3. Annotation position
label_y <- max(df_filt$ICR_rainfall_avg, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 4. Plot
ggplot(df_filt, aes(x = ICF_planting_date, y = ICR_rainfall_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 4,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date vs. ICR_rainfall_avg",
    subtitle = "Linear regression per environment (p-value and R^2 shown)",
    x = "Planting date",
    y = "Rainfall average (mm)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

##Factorial variables ## CA Performance as affected by IQR_TF_tubura_client NOtes: Conclusions: ###IQR_TF_tubura_client with Y_ratio

library(ggplot2)
library(dplyr)

# 1. Filter valid data
df_filt <- drivers_data %>%
  dplyr::filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_TF_tubura_client),
    !IQR_TF_tubura_client %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_TF_tubura_client = droplevels(IQR_TF_tubura_client))  # drop unused levels

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Run Kruskal-Wallis test
kruskal_test <- kruskal.test(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
kruskal_p <- kruskal_test$p.value

# 4. Build annotation label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 5. Calculate group means
mean_df <- df_filt %>%
  group_by(IQR_TF_tubura_client) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Plot boxplot with p-values and means
ggplot(df_filt, aes(x = IQR_TF_tubura_client, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Means under each box
  geom_text(
    data = mean_df,
    aes(x = IQR_TF_tubura_client,
        y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
        label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # P-value annotation
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_TF_tubura_client",
    subtitle = "ANOVA and Kruskal–Wallis test results shown",
    x = "Tubura client status",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_TF_tubura_client with CA_typology

CA Performance as affected by IQR_plot_fert_quality

NOtes: Conclusions: ###IQR_plot_fert_quality with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_plot_fert_quality),
    !IQR_plot_fert_quality %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_plot_fert_quality = factor(IQR_plot_fert_quality, levels = c("low_quality", "medium", "high")))


# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Tukey HSD and Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_plot_fert_quality`[, "p adj"])$Letters

letters_df <- data.frame(
  IQR_plot_fert_quality = names(letters),
  Letters = letters,
  stringsAsFactors = FALSE
)

# 4. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)$p.value
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means and letters
labels_df <- left_join(means_df, letters_df, by = "IQR_plot_fert_quality")

# 7. Plot
ggplot(df_filt, aes(x = IQR_plot_fert_quality, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Add Tukey letters above boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_plot_fert_quality, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Add means below boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_plot_fert_quality, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # Add test label
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_plot_fert_quality",
    subtitle = "Tukey HSD letters, means, and statistical tests",
    x = "Plot Fertility Quality",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_plot_fert_quality with CA_typology

CA Performance as affected by IQR_soil_texture

NOtes: Conclusions: ###IQR_soil_texture with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean the data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_soil_texture),
    !IQR_soil_texture %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_soil_texture = droplevels(IQR_soil_texture))

# Optional: Reorder texture levels manually (if desired)
# Example order: "sand", "loam", "clay", etc.
df_filt <- df_filt %>%
  mutate(IQR_soil_texture = factor(IQR_soil_texture, levels = c("Fine", "Mix", "Coarse")))

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Run Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)$p.value

# 4. Tukey HSD and compact letter display
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_texture`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_texture = names(letters), Letters = letters)

# 5. Compute group means
means_df <- df_filt %>%
  group_by(IQR_soil_texture) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_texture")

# 7. Test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_texture, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above box
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_texture, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below box
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_texture, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # Statistical tests label
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_soil_texture",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Soil Texture",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_soil_texture with CA_typology

CA Performance as affected by IQR_soil_color

NOtes: Conclusions: ###IQR_soil_color with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean the data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_soil_color),
    !IQR_soil_color %in% c("no data", "---", "", "#N/A")
  ) %>%
  mutate(IQR_soil_color = droplevels(IQR_soil_color))

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_color, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal–Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_color, data = df_filt)$p.value

# 4. Tukey HSD and letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_color`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_color = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQR_soil_color) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine letters and means
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_color")

# 7. Test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_color, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_color, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_color, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_soil_color",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Soil Color",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_soil_color with CA_typology

CA Performance as affected by IQF_position_slope

NOtes: Conclusions: ###IQF_position_slope with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQF_position_slope),
    !IQF_position_slope %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQF_position_slope = droplevels(IQF_position_slope))

# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_position_slope, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_position_slope, data = df_filt)$p.value

# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_position_slope`[, "p adj"])$Letters
letters_df <- data.frame(IQF_position_slope = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQF_position_slope) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQF_position_slope")

# 7. Statistical test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQF_position_slope, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Tukey HSD letters (above)
  geom_text(
    data = labels_df,
    aes(x = IQF_position_slope, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means (below)
  geom_text(
    data = labels_df,
    aes(x = IQF_position_slope, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQF_position_slope",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Slope Position",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQF_position_slope with CA_typology

CA Performance as affected by Weed_species_A_combined

NOtes: Conclusions: ###Weed_species_A_combined with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(Weed_species_A_combined),
    !Weed_species_A_combined %in% c("no data", "-", "—", "NA")
  ) %>%
  mutate(Weed_species_A_combined = droplevels(Weed_species_A_combined))


# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal–Wallis
kruskal_p <- kruskal.test(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)$p.value

# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`Weed_species_A_combined`[, "p adj"])$Letters
letters_df <- data.frame(Weed_species_A_combined = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(Weed_species_A_combined) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "Weed_species_A_combined")

# 7. Combine test labels
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = Weed_species_A_combined, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above
  geom_text(
    data = labels_df,
    aes(x = Weed_species_A_combined, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below
  geom_text(
    data = labels_df,
    aes(x = Weed_species_A_combined, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by Weed_species_A_combined",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Weed Species (Combined)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###Weed_species_A_combined with CA_typology

CA Performance as affected by IQF_environment

NOtes: Conclusions: ###IQF_environment with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQF_environment),
    !IQF_environment %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQF_environment = droplevels(factor(IQF_environment)))
df_filt <- df_filt %>%
  mutate(IQF_environment = factor(IQF_environment, levels = c("Highland", "Transition", "Mid_Low_Wet", "Mid_Low_Dry")))

# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_environment, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_environment, data = df_filt)$p.value

# 4. Tukey HSD + Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_environment`[, "p adj"])$Letters
letters_df <- data.frame(IQF_environment = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQF_environment) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine letters + means
labels_df <- left_join(means_df, letters_df, by = "IQF_environment")

# 7. Test summary
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQF_environment, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above
  geom_text(
    data = labels_df,
    aes(x = IQF_environment, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below
  geom_text(
    data = labels_df,
    aes(x = IQF_environment, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQF_environment",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Environment Type",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQF_environment with CA_typology

Re-Analysis after 1st meeting with Ben

Diego

2026-02-06

Data pre-Cleaning

Explore how database is reduced if we only keep trat A and C

Oulier detection

Prepare data

Extract residuals

COunt outliers

Visualization of ouliers

Found some plots with yield <0.1 tn/ha explore

Final clan data = ca_AC_clean

Create variables of crop and year and block_season combination

Yield and profit comparions exploration

% COnv>CA per soason-environment_Yield

Now we evaluate the relation among EI and evonomic and yield avdantage of CA..for amize and beans separated

Does the % wins of CA increse progresibely from season 1 to 6? Separated by crop

Mix models to evaluate treatment effect in absolute yield values, the interactions and the destribution of the residuals

Mix model and interactions

Visual of CA:CONV be season-environemnt convination

Random effects normality

Typology of Yield Performance

Typology of multi-season CA performance (23A–25B)

Ahore por environment

Now we paste the averaged relative yield along the 6 seasons to each bloc and we remove the of CA_typology

We also add the performance variable CA_yupology”

CA Performance as affected by ICR_Org_C_avg

with avg_Y_ratio

with CA_typology

with avg_Y_ratio

with avg_Y_ratio

CA Performance as affected by ICR_rainfall_avg

CA Performance as affected by IQR_plot_fert_quality

CA Performance as affected by IQR_soil_texture

CA Performance as affected by IQR_soil_color

CA Performance as affected by IQF_position_slope

CA Performance as affected by Weed_species_A_combined

CA Performance as affected by IQF_environment