library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(DataExplorer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.3.3
library(lme4)         # for lmer()
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(emmeans)      # for estimated marginal means
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(ggplot2)      # for plotting
library(dplyr)        # for data wrangling
library(tidyr)
library(broom)



CONV_CA_Ratio <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/CONV_CA_Ratio.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 2818 Columns: 105
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (40): IQR_Season, Seas_block_treat, IQR_SeasAB, Sample code, IQF_agzone,...
## dbl (64): IQR_TF_tubura_client, ICR_age_hh_head, ICR_HH_size, ICR_arable_lan...
## lgl  (1): IQR_weed1_complexity
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(dplyr)
# Mix model to evaluate effect of crop, season, year and interactions on CA:CONV ratio Description of the analysis: Linear mixed-effects model fitted to the log-transformed yield ratio (log(Y_ratio + 1)) comparing Conservation Agriculture (CA) to Conventional Agriculture (CONV). The model included:Fixed effects: crop, year, IQF_environment, and all two- and three-way interactions Random effect: IQR_block The treatment variable (Treat_code) was excluded in this model (fit2_conv) Model performance was compared to a simpler model (fit1_conv) using BIC. The summary of fixed effects and significance tests were obtained via ANOVA with Satterthwaite approximation.
Description of the results: On the analysis of the log-transformed CA:CONV yield ratio, there was a significant effect of crop, year, and environment, as well as their two- and three-way interactions.
r # Model 1: no log transform, no Treat_code fit1_conv <- lmer( Y_ratio ~ crop + year + IQR_Season + IQF_environment + IQR_Season:IQF_environment + (1 | IQR_block), data = CONV_CA_Ratio )
## fixed-effect model matrix is rank deficient so dropping 2 columns / coefficients
```r # Model 2: log transform, no Treat_code fit2_conv <- lmer( log(Y_ratio + 1) ~ crop + year + crop:year + IQF_environment + crop:IQF_environment + year:IQF_environment + year:crop:IQF_environment + year:crop + (1 | IQR_block), data = CONV_CA_Ratio )
# Model comparison BIC(fit1_conv, fit2_conv) ```
## df BIC ## fit1_conv 26 2079.005 ## fit2_conv 18 -1766.859
r # View model summaries summary(fit1_conv)
## Linear mixed model fit by REML ['lmerMod'] ## Formula: ## Y_ratio ~ crop + year + IQR_Season + IQF_environment + IQR_Season:IQF_environment + ## (1 | IQR_block) ## Data: CONV_CA_Ratio ## ## REML criterion at convergence: 1872.5 ## ## Scaled residuals: ## Min 1Q Median 3Q Max ## -2.5822 -0.5427 -0.0954 0.4009 9.8223 ## ## Random effects: ## Groups Name Variance Std.Dev. ## IQR_block (Intercept) 0.01826 0.1351 ## Residual 0.09644 0.3105 ## Number of obs: 2818, groups: IQR_block, 565 ## ## Fixed effects: ## Estimate Std. Error t value ## (Intercept) 2.75394 1.07417 2.564 ## cropMaize -0.09917 0.08647 -1.147 ## year -0.07516 0.04296 -1.749 ## IQR_Season23B -0.09980 0.09560 -1.044 ## IQR_Season24A -0.24542 0.05258 -4.668 ## IQR_Season24B -0.34972 0.06028 -5.802 ## IQF_environmentMid_Low_Dry -0.02494 0.03979 -0.627 ## IQF_environmentMid_Low_Wet -0.01160 0.04143 -0.280 ## IQF_environmentTransition 0.01562 0.04527 0.345 ## IQR_Season23B:IQF_environmentMid_Low_Dry -0.24710 0.05346 -4.622 ## IQR_Season24A:IQF_environmentMid_Low_Dry 0.26474 0.05466 4.843 ## IQR_Season24B:IQF_environmentMid_Low_Dry 0.19055 0.05409 3.523 ## IQR_Season25A:IQF_environmentMid_Low_Dry -0.06885 0.09277 -0.742 ## IQR_Season25B:IQF_environmentMid_Low_Dry -0.01160 0.05507 -0.211 ## IQR_Season23B:IQF_environmentMid_Low_Wet -0.06178 0.05539 -1.115 ## IQR_Season24A:IQF_environmentMid_Low_Wet 0.24877 0.05706 4.360 ## IQR_Season24B:IQF_environmentMid_Low_Wet 0.15245 0.05622 2.712 ## IQR_Season25A:IQF_environmentMid_Low_Wet 0.06260 0.09438 0.663 ## IQR_Season25B:IQF_environmentMid_Low_Wet 0.05454 0.05756 0.948 ## IQR_Season23B:IQF_environmentTransition -0.10401 0.06086 -1.709 ## IQR_Season24A:IQF_environmentTransition 0.17929 0.06288 2.851 ## IQR_Season24B:IQF_environmentTransition 0.15105 0.06199 2.437 ## IQR_Season25A:IQF_environmentTransition 0.03154 0.09673 0.326 ## IQR_Season25B:IQF_environmentTransition 0.04978 0.06287 0.792
## ## Correlation matrix not shown by default, as p = 24 > 12. ## Use print(x, correlation=TRUE) or ## vcov(x) if you need it
## fit warnings: ## fixed-effect model matrix is rank deficient so dropping 2 columns / coefficients
r summary(fit2_conv)
## Linear mixed model fit by REML ['lmerMod'] ## Formula: log(Y_ratio + 1) ~ crop + year + crop:year + IQF_environment + ## crop:IQF_environment + year:IQF_environment + year:crop:IQF_environment + ## year:crop + (1 | IQR_block) ## Data: CONV_CA_Ratio ## ## REML criterion at convergence: -1909.8 ## ## Scaled residuals: ## Min 1Q Median 3Q Max ## -3.2810 -0.5815 -0.0355 0.4949 6.5777 ## ## Random effects: ## Groups Name Variance Std.Dev. ## IQR_block (Intercept) 0.005424 0.07365 ## Residual 0.024837 0.15760 ## Number of obs: 2818, groups: IQR_block, 565 ## ## Fixed effects: ## Estimate Std. Error t value ## (Intercept) 1.10145 0.25968 4.242 ## cropMaize 2.18031 0.47356 4.604 ## year -0.02227 0.01083 -2.057 ## IQF_environmentMid_Low_Dry -1.71564 0.33938 -5.055 ## IQF_environmentMid_Low_Wet -0.79508 0.35516 -2.239 ## IQF_environmentTransition -0.98950 0.38866 -2.546 ## cropMaize:year -0.09316 0.01999 -4.660 ## cropMaize:IQF_environmentMid_Low_Dry 0.43874 0.56254 0.780 ## cropMaize:IQF_environmentMid_Low_Wet -1.24660 0.58205 -2.142 ## cropMaize:IQF_environmentTransition -0.88133 0.61635 -1.430 ## year:IQF_environmentMid_Low_Dry 0.07026 0.01415 4.965 ## year:IQF_environmentMid_Low_Wet 0.03404 0.01482 2.298 ## year:IQF_environmentTransition 0.04245 0.01621 2.619 ## cropMaize:year:IQF_environmentMid_Low_Dry -0.01417 0.02366 -0.599 ## cropMaize:year:IQF_environmentMid_Low_Wet 0.05497 0.02448 2.245 ## cropMaize:year:IQF_environmentTransition 0.03960 0.02589 1.530
## ## Correlation matrix not shown by default, as p = 16 > 12. ## Use print(x, correlation=TRUE) or ## vcov(x) if you need it
r # Fixed effects significance anova(fit2_conv)
## Analysis of Variance Table ## npar Sum Sq Mean Sq F value ## crop 1 0.33073 0.33073 13.3159 ## year 1 0.35808 0.35808 14.4170 ## IQF_environment 3 0.50845 0.16948 6.8239 ## crop:year 1 1.86809 1.86809 75.2142 ## crop:IQF_environment 3 0.44410 0.14803 5.9602 ## year:IQF_environment 3 0.81956 0.27319 10.9992 ## crop:year:IQF_environment 3 0.39517 0.13172 5.3036
r # Residual diagnostics plot(fit2_conv)
r qqnorm(resid(fit2_conv)); qqline(resid(fit2_conv))
# Description of the performance Description of the analysis: Used eemeans from the mixed-effects model to summarize the overall CA:CONV yield ratio, and to assess the effects of crop, year, and environment. Results were back-transformed for interpretation on the original yield ratio scale. Interaction effects were explored and visualized.
Description of the results: The model’s estimated marginal means show that CONV outperformed CA overall, with an average CA:CONV yield ratio of 0.787. This is, on average, CA yields were approximately 21% lower than CONV yields across all observations. The advantage of CONV was similar for both maize and beans, as supported by the non-significant difference in marginal means between crops (p = 0.7). While year had a statistically significant effect, there was no consistent trend suggesting that CA performance improved from year 1 to 3. In contrast, environment played a clearer role: “Transition” and “Mid-Low-Wet” environments were associated with relatively higher CA performance, suggesting that these conditions may be more favorable for Conservation Agriculture. Conventional outperforms CA as a whole experimental average by (overall CA:CONV ratio = 0.787 (eemeans for the same model shown above). Advantage of CONV was similar for maize and beans (t-tests of individual parameter estimates p=0.7). Although there was a significant effect of year, there was no trend for CA to perform better from year 1 to 3. Also, there was a clear effect of the environment with transition and mid-low-wet being more favourable for CA Although there was a significant effect of year, there was no trend for CA to perform better from year 1 to 3. Also, there was a clear effect of the environment with transition and mid-low-wet being more favourable for CA ## Overal difference
r overall_emm <- emmeans(fit2_conv, ~ 1, type = "response") print(overall_emm)
## 1 response SE df lower.CL upper.CL ## overall 0.779 0.0081 580 0.763 0.795 ## ## Results are averaged over the levels of: crop, IQF_environment ## Degrees-of-freedom method: kenward-roger ## Confidence level used: 0.95 ## Intervals are back-transformed from the log(mu + 1) scale ## eemeans to interpret the effects of categorical predictors and their interactions
```r library(emmeans)
# Compute emmeans for crop within year × environment emm_crop_year_env <- emmeans(fit2_conv, ~ crop | year * IQF_environment, type = “response”) emm_df <- as.data.frame(emm_crop_year_env) ```
## Visualization of results
r ggplot(emm_df, aes(x = IQF_environment, y = response, color = crop)) + geom_point(position = position_dodge(0.4), size = 3) + geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), position = position_dodge(0.4), width = 0.2) + facet_wrap(~ year, nrow = 1) + labs( title = "Estimated Yield Ratio (CA / CONV) by Crop, Year, and Environment", x = "Environment Type", y = "Estimated Y_ratio (CA : CONV)", color = "Crop" ) + theme_minimal(base_size = 14) + theme( axis.text.x = element_text(size = 8, angle = 45, hjust = 1) )
## Does the % wins of CA increse progresibely from year 1 to 3? Separated by crop
```r library(dplyr) library(ggplot2) library(broom) library(tidyr)
# —————————– # 1. Prepare data # —————————– trend_data <- CONV_CA_Ratio %>% filter( is.finite(Y_ratio), year %in% c(“23”, “24”, “25”), IQR_SeasAB %in% c(“A”, “B”) ) %>% mutate( year = as.numeric(year), crop_type = ifelse(IQR_SeasAB == “A”, “Maize”, “Beans”) )
# —————————– # 2. Function to compute slope counts # —————————– get_slope_label <- function(df) { # One slope per block slopes <- df %>% group_by(IQR_block) %>% filter(n_distinct(year) >= 2) %>% summarise(slope = coef(lm(Y_ratio ~ year))[2], .groups = “drop”) %>% mutate( direction = case_when( slope > 0 ~ “↑ Positive”, slope < 0 ~ “↓ Negative”, TRUE ~ “= Flat” ) ) %>% count(direction) %>% complete(direction, fill = list(n = 0))
# Format label text paste0( “↑ Positive:”, slopes\(n[slopes\)direction == “↑ Positive”], “”, “↓ Negative:”, slopes\(n[slopes\)direction == “↓ Negative”], “”, “= Flat:”, slopes\(n[slopes\)direction == “= Flat”] ) }
# Labels label_maize <- get_slope_label(trend_data %>% filter(crop_type == “Maize”)) label_beans <- get_slope_label(trend_data %>% filter(crop_type == “Beans”))
# Common y-limit helper y_max <- max(trend_data$Y_ratio, na.rm = TRUE)
# —————————– # 3. MAIZE plot # —————————– p_maize <- ggplot( trend_data %>% filter(crop_type == “Maize”), aes(x = year, y = Y_ratio) ) + geom_point(alpha = 0.35, size = 2) +
# Block-level linear trends geom_smooth( aes(group = IQR_block), method = “lm”, se = FALSE, color = “steelblue”, linewidth = 0.6 ) +
# Global trend geom_smooth( method = “lm”, se = FALSE, color = “red”, linewidth = 1.3 ) +
# Annotation annotate( “label”, x = 23.05, y = y_max * 0.95, label = label_maize, hjust = 0, vjust = 1, size = 4.5, fontface = “bold”, fill = “white”, label.size = 0.4 ) +
scale_x_continuous(breaks = c(23, 24, 25)) + labs( title = “Block-level and Overall Linear Trends of Y_ratio (Maize)”, x = “Year”, y = “Relative Yield (Y_ratio)” ) + theme_minimal() + theme( plot.title = element_text(face = “bold”, hjust = 0.5, size = 12) ) ```
## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_maize, ## : Ignoring unknown parameters: `label.size`
```r # —————————– # 4. BEANS plot # —————————– p_beans <- ggplot( trend_data %>% filter(crop_type == “Beans”), aes(x = year, y = Y_ratio) ) + geom_point(alpha = 0.35, size = 2) +
# Block-level linear trends geom_smooth( aes(group = IQR_block), method = “lm”, se = FALSE, color = “darkorange”, linewidth = 0.6 ) +
# Global trend geom_smooth( method = “lm”, se = FALSE, color = “red”, linewidth = 1.3 ) +
# Annotation annotate( “label”, x = 23.05, y = y_max * 0.95, label = label_beans, hjust = 0, vjust = 1, size = 4.5, fontface = “bold”, fill = “white”, label.size = 0.4 ) +
scale_x_continuous(breaks = c(23, 24, 25)) + labs( title = “Block-level and Overall Linear Trends of Y_ratio (Beans)”, x = “Year”, y = “Relative Yield (Y_ratio)” ) + theme_minimal() + theme( plot.title = element_text(face = “bold”, hjust = 0.5, size = 12) ) ```
## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_beans, ## : Ignoring unknown parameters: `label.size`
r # ----------------------------- # 5. Print plots # ----------------------------- print(p_maize)
## `geom_smooth()` using formula = 'y ~ x' ## `geom_smooth()` using formula = 'y ~ x'
r print(p_beans)
## `geom_smooth()` using formula = 'y ~ x' ## `geom_smooth()` using formula = 'y ~ x'
# What if we target the environment or district? Description of the analysis: We first classified each block into performance typologies (Very Poor to Good) based on its CA:CONV yield ratio across seasons.For each block, we calculated its average relative yield, identified whether it ever experienced a yield penalty greater than 5%, and compiled its full sequence of seasonal performances. Based on these indicators, each farm was grouped into one of four long-term performance types: -Good: no season < 0.95 -Acceptable: Average ≥ 0.95 but at least one season < 0.95 -Poor: Average between 0.85 and 0.95 -Very Poor: Average ≤ 0.85 These thresholds were defined arbitrarily but intentionally, informed by field experience and our understanding of what yield reductions farmers may find acceptable if longer-term soil-health and economic benefits of CA are demonstrated and communicated clearly. Importantly, this typology is not final—it will be refined and validated through farmer consultations to ensure the categories and cut-off points align with farmers’ own perceptions of acceptable and unacceptable performance.
We then visualized the distribution of these typologies by environment and district. Finally, we summarized and plotted the average relative yield per block, aggregated by district, to highlight areas where CA performed relatively better.
Description of the results: Even if we look at the district level, the best districts have a poor CA performance on average. The district where CA performed better (as an average from the 6 seasons) was Gisagara, and even there, the average CA:CONV ratio was 0.95.
## Yield performance as a qualitative evaluation of multiple seasons
```r library(dplyr) library(tidyr)
# Step 1: Define target seasons seasons <- c(“23A”, “23B”, “24A”, “24B”, “25A”, “25B”)
# Step 2: Compute number of valid seasons per block blocks_with_enough_seasons <- CONV_CA_Ratio %>% filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>% group_by(IQR_block) %>% summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = “drop”) %>% filter(n_valid_seasons >= 4)
# Step 3: Filter main dataset to include only blocks with ≥4 seasons filtered_yield <- CONV_CA_Ratio %>% filter(IQR_block %in% blocks_with_enough_seasons$IQR_block, IQR_Season %in% seasons, is.finite(Y_ratio))
# Step 4: Prepare wide-format Relative Yield table RY_wide <- filtered_yield %>% group_by(IQR_block, IQR_Season) %>% summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = “drop”) %>% pivot_wider( names_from = IQR_Season, values_from = Y_ratio, names_prefix = “Y_ratio_” )
# Step 5: Compute performance typology Perf_Type <- filtered_yield %>% group_by(IQR_block) %>% summarise( n_seasons = n_distinct(IQR_Season), any_below_0.95 = any(Y_ratio < 0.95), mean_ratio = mean(Y_ratio, na.rm = TRUE), .groups = “drop” ) %>% mutate( CA_typology = case_when( !any_below_0.95 ~ “Good”, any_below_0.95 & mean_ratio >= 0.95 ~ “Acceptable”, any_below_0.95 & mean_ratio < 0.95 & mean_ratio > 0.85 ~ “Poor”, mean_ratio <= 0.85 ~ “Very Poor” ) )
# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons block_meta <- CONV_CA_Ratio %>% filter(IQR_Season %in% c(“23A”, “24A”, “25A”)) %>% arrange(IQR_block, match(IQR_Season, c(“23A”, “24A”, “25A”))) %>% # Prioritize 23A > 24A > 25A distinct(IQR_block, .keep_all = TRUE) %>% dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)
# Step 7: Join all into Perf_Type Perf_Type <- Perf_Type %>% left_join(RY_wide, by = “IQR_block”) %>% left_join(block_meta, by = “IQR_block”) ```
## Yield performance by environmetn
```r # Set factor levels for consistent order Perf_Type\(CA_typology <- factor(Perf_Type\)CA_typology, levels = c(“Very Poor”, “Poor”, “Acceptable”, “Good”))
ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) + geom_bar( aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)), position = “stack” ) + facet_wrap(~ IQF_environment) + labs( title = “CA-Performance Typology Distribution by OAF-AEZ”, x = “Performance Typology”, y = “Percentage of Performance Typologies” ) + scale_fill_manual(values = c( “Very Poor” = “#e41a1c”, “Poor” = “#ff7f00”, “Acceptable” = “#377eb8”, “Good” = “#4daf4a” )) + theme_minimal() + theme( legend.position = “none”, strip.text = element_text(face = “bold”) ) ```
## Yield performance by districy
```r # Set factor levels for consistent order Perf_Type\(CA_typology <- factor(Perf_Type\)CA_typology, levels = c(“Very Poor”, “Poor”, “Acceptable”, “Good”))
ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) + geom_bar( aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)), position = “stack” ) + facet_wrap(~ IQF_District) + labs( title = “CA-Performance Typology Distribution by District”, x = “Performance Typology”, y = “Percentage of Performance Typologies” ) + scale_fill_manual(values = c( “Very Poor” = “#e41a1c”, “Poor” = “#ff7f00”, “Acceptable” = “#377eb8”, “Good” = “#4daf4a” )) + theme_minimal() + theme( legend.position = “none”, strip.text = element_text(face = “bold”) ) ```
## Average yiel performance across seasons per district
r library(dplyr) library(ggplot2) library(glue)
## Warning: package 'glue' was built under R version 4.3.3
```r # 1. Mean Y_ratio per block (across seasons) block_means <- CONV_CA_Ratio %>% filter(is.finite(Y_ratio)) %>% group_by(IQR_block, IQF_District) %>% summarise(mean_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = “drop”)
# 2. Summary for annotations district_summary <- block_means %>% group_by(IQF_District) %>% summarise( avg_ratio = round(mean(mean_Y_ratio, na.rm = TRUE), 2), n_total = n(), n_ge1 = sum(mean_Y_ratio >= 1, na.rm = TRUE), percent_ge1 = round(100 * n_ge1 / n_total) ) %>% mutate( label = glue(“≥1: {percent_ge1}%= {avg_ratio}”), y_pos = 1.5 )
# 3. Plot ggplot(block_means, aes(x = IQF_District, y = mean_Y_ratio)) + geom_boxplot(outlier.shape = NA, fill = “gray90”) + geom_jitter(width = 0.2, alpha = 0.5, color = “steelblue”, size = 2) + geom_text( data = district_summary, aes(x = IQF_District, y = y_pos, label = label), inherit.aes = FALSE, size = 3.5, vjust = 0 ) + labs( title = “Distribution of Mean Y_ratio per Block by District”, x = “District”, y = “Mean Y_ratio per Block (across seasons)” ) + theme_minimal() + theme( axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(face = “bold”, hjust = 0.5) ) ```
# Now we look at the individual plts rather than averages Description of the analysis: We analyzed the block-level average performance of Conservation Agriculture (CA) relative to Conventional Agriculture (CONV) by calculating the mean CA:CONV yield ratio per block across all seasons. A cumulative distribution plot was used to visualize how many blocks reached or exceeded yield parity (Y_ratio ≥ 1). Additionally, density plots by agro-environment visualized how CA performance varied across zones.
Description of the results: As we showed previously, when examining the average performance of CA at the country, environment, or district level, results are discouraging, suggesting that a blanket recommendation of this package is unlikely to generate broad impact or adoption, even if we target the best environmetn or district. However, when zooming in to the plot level, there are a substantial number of cases where CA either outperforms CONV or has a similar performance. Specifically, CA outperforms CONV in 18% of blocks, and in 21% of cases achieves a yield ratio close to or above acceptable thresholds (Shown before in the figure of Performance Typologies)
```r library(dplyr) library(ggplot2)
# 1. Summarize data to block level block_data <- CONV_CA_Ratio %>% filter(is.finite(Y_ratio)) %>% group_by(IQR_block) %>% summarise( avg_Y_ratio = mean(Y_ratio, na.rm = TRUE), IQF_environment = first(IQF_environment), # to retain for optional grouping .groups = “drop” ) %>% arrange(avg_Y_ratio) %>% mutate(cum_percent = 100 * (row_number() / n()))
# 2. Plot cumulative distribution ggplot(block_data, aes(x = avg_Y_ratio, y = cum_percent)) + geom_point(size = 2, alpha = 0.7, color = “darkorange”) + geom_vline(xintercept = 1, linetype = “dashed”, color = “blue”) + scale_x_continuous(limits = c(0, 2.5)) + scale_y_continuous(breaks = seq(0, 100, by = 10)) + labs( x = “Block-Level Average Yield Ratio (CA / CONV)”, y = “Cumulative % of Blocks”, title = “Cumulative Distribution of Relative Yield (CA vs CONV)”, subtitle = “Each point is a block’s average over all seasons; Dashed line = yield parity”, ) + theme_minimal() + theme( plot.title = element_text(face = “bold”, hjust = 0.5), plot.subtitle = element_text(size = 10, hjust = 0.5) ) ```
##Visual distribution of CONV advantage vs CA_Yield
```r library(ggplot2)
ggplot(CONV_CA_Ratio, aes(x = Y_ratio, fill = IQF_environment)) + geom_density(alpha = 0.3) + geom_vline(xintercept = 1, linetype = “dashed”) + labs(title = “Density of Relative Yield (CA / CONV) by Agzone”) ```

Drivers of yield performance

We start with the database of yield drivers that was already processed in the scrips: “data_mm_block” but we change the relative

data_mm_block <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/data_mm_block.csv")
## Rows: 473 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): IQR_block, IQR_TF_tubura_client, IQR_plot_fert_quality, IQR_soil_te...
## dbl (8): ICR_Org_C_avg, ICR_K_perc_23A, ICR_Avail_P_avg, ICR_arable_land_avg...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drivers_data <- data_mm_block %>%
  mutate(
    CA_typology = fct_drop(CA_typology),

    IQR_block = factor(IQR_block),
    IQR_TF_tubura_client  = factor(IQR_TF_tubura_client),
    IQR_plot_fert_quality = factor(IQR_plot_fert_quality),
    IQR_soil_texture      = fct_na_value_to_level(factor(IQR_soil_texture), "Unknown"),
    IQR_soil_color        = fct_na_value_to_level(factor(IQR_soil_color), "Unknown"),
    IQF_position_slope    = fct_na_value_to_level(factor(IQF_position_slope), "Unknown"),
    Weed_species_A_combined = fct_na_value_to_level(factor(Weed_species_A_combined), "Unknown")
  )

str(drivers_data)
## tibble [473 × 17] (S3: tbl_df/tbl/data.frame)
##  $ IQR_block              : Factor w/ 473 levels "10_1","10_10",..: 15 16 17 18 19 20 21 22 23 24 ...
##  $ ICR_Org_C_avg          : num [1:473] 2.01 1.75 2.16 1.68 1.99 ...
##  $ ICR_K_perc_23A         : num [1:473] 0.75 0.32 0.57 0.41 0.42 0.55 1.75 1.39 2.09 1.25 ...
##  $ ICR_Avail_P_avg        : num [1:473] 11.7 11.5 22 60.6 35.6 ...
##  $ ICR_arable_land_avg    : num [1:473] 40.4 56.4 45.8 54 28.2 54.6 51.8 47.8 60.2 38.2 ...
##  $ ICF_Alt_avg            : num [1:473] 1636 1293 1141 1207 1145 ...
##  $ ICR_rainfall_avg       : num [1:473] 3.96 3.97 3.97 3.97 3.97 ...
##  $ Comp_amount_corr_quali : num [1:473] 5.17 4.39 4.42 4.73 4.9 ...
##  $ ICF_planting_date      : num [1:473] 8.2 9.4 6.75 12.4 5.2 2.8 6.4 4.4 5.2 6.8 ...
##  $ IQR_TF_tubura_client   : Factor w/ 3 levels "---","0","1": 3 3 2 3 2 3 3 3 3 3 ...
##  $ IQR_plot_fert_quality  : Factor w/ 4 levels "---","high","low_quality",..: 4 2 4 2 4 4 4 2 4 2 ...
##  $ IQR_soil_texture       : Factor w/ 4 levels "Coarse","Fine",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ IQR_soil_color         : Factor w/ 6 levels "#N/A","black",..: 2 2 2 5 2 5 2 2 5 2 ...
##  $ IQF_position_slope     : Factor w/ 7 levels "---","Backslope",..: 4 2 2 2 4 2 6 2 2 2 ...
##  $ Weed_species_A_combined: Factor w/ 5 levels "—","Broadleaf",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ CA_typology            : Factor w/ 3 levels "Acceptable","Poor",..: 3 1 3 2 1 3 1 1 2 2 ...
##  $ IQF_environment        : chr [1:473] "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" ...

Now we paste the averaged relative yield along the 6 seasons to each bloc and we remove the of CA_typology

# 1. Remove 'CA_typology' column from drivers_data
drivers_data <- drivers_data |> dplyr::select(-CA_typology)

# 2. Calculate average Y_ratio by IQR_block from CONV_CA_Ratio
# First, ensure grouping by both IQR_block and IQR_season, then average per IQR_block
avg_Y_ratio <- CONV_CA_Ratio |>
  dplyr::group_by(IQR_block, IQR_Season) |>
  dplyr::summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") |>
  dplyr::group_by(IQR_block) |>
  dplyr::summarise(avg_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")

# 3. Merge avg_Y_ratio into drivers_data by IQR_block
drivers_data <- drivers_data |> 
  dplyr::left_join(avg_Y_ratio, by = "IQR_block")

We also add the performance variable CA_yupology”

library(dplyr)

# Merge CA_typology from Perf_Type using IQR_block as the key
drivers_data <- drivers_data |>
  left_join(Perf_Type |>
              dplyr::select(IQR_block, CA_typology),
            by = "IQR_block")

#Explore, visually the protential drivers and the relatio nwith Y_ration of the blocks ## Cathegoric variables Variables we have so far (after filtering some out and re-grouping cathegories with low N: ICR_Org_C_avg num: ICR_K_perc_23A num: ICR_Avail_P_avg num: ICR_arable_land_avg num: ICF_Alt_avg num: ICR_rainfall_avg num: Comp_amount_corr_quali num: ICF_planting_date num: IQR_TF_tubura_client Factor: IQR_plot_fert_quality Factor: IQR_soil_texture Factor w/ 4 levels: IQR_soil_color Factor w/ 6 levels: IQF_position_slope Factor w/ 7 levels: Weed_species_A_combined: Factor w/ 5 levels: IQF_environment chr “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” Output variables: avg_Y_ratio num: CA_typology factor:

CA Performance as affected by ICR_Org_C_avg

with avg_Y_ratio

Notes: when we look at all the database pooled together, there is no relation at all among SOC and performance of the CA. Howeever, where we split the analysis be Environment, then an interation becomes clear, where there is no relation in mid altitude and the relation is quite significant and positive in Highland and sligtly negative in transition. Conclusion. SOC should remain for the drivers analysis

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_Org_C_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_Org_C_avg vs. avg_Y_ratio",
    x = "Organic Carbon (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

### What if we facet by environment? to see if interactions are masking the effect

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions (adjust if needed)
#----------------------------------------------
# Choose fixed position for annotation
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, lm line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_Org_C_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Organic Carbon (%)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

with CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Fit ANOVA model
model_anova <- aov(ICR_Org_C_avg ~ CA_typology, data = drivers_data)

# 2. Tukey HSD post-hoc test
tukey <- TukeyHSD(model_anova)

# 3. Extract group letters
letters_df <- multcompLetters(tukey$CA_typology[,"p adj"])$Letters
letters_df <- data.frame(CA_typology = names(letters_df), Letters = letters_df)

# 4. Calculate group means
means_df <- drivers_data %>%
  group_by(CA_typology) %>%
  summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 5. Combine letters and means
plot_labels <- left_join(letters_df, means_df, by = "CA_typology")

# 6. Create boxplot with letters and means at fixed Y positions
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray", color = "black") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_text(data = plot_labels, aes(x = CA_typology, y = 7.5, label = Letters),
            fontface = "bold", size = 5, color = "darkred") +
  geom_text(data = plot_labels, aes(x = CA_typology, y = 7, label = mean_label),
            fontface = "plain", size = 4, color = "black") +
  labs(
    title = "ICR_Org_C_avg by CA_typology",
    subtitle = "Letters indicate significant differences (Tukey HSD)",
    x = "CA Typology",
    y = "Organic Carbon (%)"
  ) +
  theme_minimal()

### What if we facet by Environment? myve some interactions masking the effect?

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------

get_letters <- function(df) {

  mod <- aov(ICR_Org_C_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)

  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------

stats_df <- drivers_data %>%
  filter(!is.na(CA_typology)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------

ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +

  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

# Letters
geom_text(
  data = stats_df,
  aes(x = CA_typology, y = 9.5, label = Letters),
  inherit.aes = FALSE,
  fontface = "bold",
  size = 3,
  color = "darkred"
) +

# Means
geom_text(
  data = stats_df,
  aes(x = CA_typology, y = 8.25, label = mean_label),
  inherit.aes = FALSE,
  size = 3
) +


  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_Org_C_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Organic Carbon (%)"
  ) +

  theme_minimal()

## CA Performance as affected by ICR_K_perc_23A ### with avg_Y_ratio Notes: there is some minimal relation in highland. When we look at the correlation among SOC and K, its relatively high in highland. So we could just use SOC in the drivers analysis and exclude K Conclusion: Keep SCO but explcude K

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_K_perc_23A), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_K_perc_23A, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_K_perc_23A, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_K_perc_23A vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Potassium (%) in 23A",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

### with CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
  mod <- aov(ICR_K_perc_23A ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_K_perc_23A)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_K_perc_23A, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_K_perc_23A)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  # Letters
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 3.2, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  # Means
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 2.8, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_K_perc_23A by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Potassium (%) in 23A"
  ) +

  theme_minimal()
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Since SOC semms to have some explanatory power and K very little, lets see the correlation among them Notes: there is quite a correlation among K and SOC in highland

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_K_perc_23A), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_K_perc_23A ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3), 
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_K_perc_23A)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Potassium (23A)",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_K_perc_23A (Potassium %)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

## CA Performance as affected by ICR_Avail_P_avg ### with avg_Y_ratio NOtes: P seems to have some explanatory power in 2 of the 4 environemnts and the correlation with SOC is not strong. Conclusions: keep it for the drivers analysis

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_Avail_P_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_Avail_P_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Avail_P_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_Avail_P_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Available Phosphorus (ppm)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

###ICR_Avail_P_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
  mod <- aov(ICR_Avail_P_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_Avail_P_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_Avail_P_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Avail_P_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  # Letters
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 110, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  # Means
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 100, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_Avail_P_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Available Phosphorus (ppm)"
  ) +
  theme_minimal()

###Correlation: ICR_Org_C_avg vs. ICR_Avail_P_avg

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_Avail_P_avg), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_Avail_P_avg ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3), 
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_Avail_P_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Available Phosphorus",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_Avail_P_avg (Phosphorus ppm)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

## CA Performance as affected by ICR_arable_land_avg

NOtes: no correlation when pooled together but a trant in 2 out of 4 environments. Moreover it is a hosehold variable that is ususally used to explain perfomrance of practices Conclusions: Keep it for drivers analysis

with avg_Y_ratio

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_arable_land_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_arable_land_avg vs. avg_Y_ratio",
    x = "Arable Land (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

### In facet by environment

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICR_arable_land_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_arable_land_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_arable_land_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values shown per environment",
    x = "Arable land (%)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

###ICR_arable_land_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

get_letters <- function(df) {
  mod <- aov(ICR_arable_land_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_arable_land_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_arable_land_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 1)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
ggplot(drivers_data, aes(x = CA_typology, y = ICR_arable_land_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 90, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 82, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +

  labs(
    title = "ICR_arable_land_avg by CA_typology (per environment)",
    subtitle = "Letters = Tukey HSD within each environment",
    x = "CA Typology",
    y = "Arable land (%)"
  ) +
  theme_minimal()

###Correlation: ICR_Org_C_avg vs ICR_arable_land_avg

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

correlation_stats <- drivers_data %>%
  filter(!is.na(ICR_Org_C_avg), !is.na(ICR_arable_land_avg), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_arable_land_avg ~ ICR_Org_C_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR", "\u00B2", " = ", round(r_squared, 2))
  )

label_y <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95

correlation_stats <- correlation_stats %>%
  mutate(x = label_x, y = label_y)

ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_arable_land_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = correlation_stats,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    hjust = 1, vjust = 1,
    size = 4
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Correlation between Organic Carbon and Arable Land",
    subtitle = "Linear regression per environment with p-values and R\u00B2",
    x = "ICR_Org_C_avg (Organic Carbon %)",
    y = "ICR_arable_land_avg (Arable land %)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

## CA Performance as affected by ICF_Alt_avg

NOtes: there seems to be no correlation when pooled together and even less when split by environment. Environment clasiffication is mainly based on altitude so if we are using em=nvironments as driver, altitude can be eliminated Conclusions: Do not use further in drivers analysis

###All together

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_Alt_avg, data = drivers_data)

# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4]  # p-value for slope

# 3. Create plot
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICF_Alt_avg vs. avg_Y_ratio",
    x = "Altitude (%)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3)),
           size = 5, color = "black") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

with avg_Y_ratio

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
  filter(!is.na(ICF_Alt_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICF_Alt_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICF_Alt_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_Alt_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values per environment",
    x = "Altitude (m)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

###ICF_Alt_avg by CA_typology

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

# Function for letters
get_letters <- function(df) {
  mod <- aov(ICF_Alt_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

# Run stats by environment
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICF_Alt_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICF_Alt_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 0)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICF_Alt_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 2000, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 1800, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_Alt_avg by CA_typology (per environment)",
    subtitle = "Tukey HSD: significant differences by letter",
    x = "CA Typology",
    y = "Altitude (m)"
  ) +
  theme_minimal()

CA Performance as affected by ICR_rainfall_avg

NOtes: Rain seems to partially explain CA performance, with a cuadratice-model response when pooled all environmetns together, with 4 mm per day as optimumn. However, when split by environments, linear models can fit as good Conclusions: Keep, consider cuadratic relation and not linear if not grouped within environments ###Simple regression: ICR_rainfall_avg vs avg_Y_ratio

library(ggplot2)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg, data = drivers_data)

# 2. Extract p-value and R-squared from model summary
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
  annotate("text", 
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5, color = "black") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

### What wbout tieh a cuadratic model?

library(ggplot2)

# 1. Fit quadratic model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2),
            data = drivers_data)

# 2. Extract p-value and R²
p_val <- summary(model)$coefficients["I(ICR_rainfall_avg^2)", 4]
r2 <- summary(model)$r.squared

# 3. Create plot with p and R²
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2),
    se = TRUE,
    color = "darkred",
    linetype = "dashed"
  ) +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
annotate("text", 
         x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
         label = paste0("p = ", signif(p_val, 3),
                        "\nR^2 = ", round(r2, 2)),
         size = 5)

###Faceted regression: ICR_rainfall_avg vs avg_Y_ratio by environment

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

# 1. Fit separate linear models per environment
stats_df <- drivers_data %>%
  filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    p_label = paste0("p = ", signif(p_value, 3))
  ) %>%
  dplyr::select(IQF_environment, p_label)

# 2. Annotation positions
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_rainfall_avg, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = p_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_rainfall_avg vs. avg_Y_ratio",
    subtitle = "Linear regressions and p-values per environment",
    x = "Rainfall (avg, mm)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

### same as avobe but with cuadratic model

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

# 1. Fit quadratic models per environment
stats_df <- drivers_data %>%
  filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2), data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients["I(ICR_rainfall_avg^2)", 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),

    # IMPORTANT: use R^2 (not R²)
    stat_label = paste0(
      "p = ", signif(p_value, 3),
      "\nR^2 = ", round(r_squared, 2)
    )
  ) %>%
  dplyr::select(IQF_environment, stat_label)


stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2),
    se = TRUE,
    color = "darkred",
    linetype = "dashed"
  ) +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
labs(
  title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
  subtitle = "Quadratic regressions per environment with p-values and R^2",
  x = "Rainfall (avg, mm)",
  y = "Average Yield Ratio"
)

###Boxplot: ICR_rainfall_avg by CA_typology (with Tukey HSD letters)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

# Function to extract Tukey letters
get_letters <- function(df) {
  mod <- aov(ICR_rainfall_avg ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

# Run stats by environment
stats_df <- drivers_data %>%
  filter(!is.na(CA_typology), !is.na(ICR_rainfall_avg)) %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICR_rainfall_avg, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICR_rainfall_avg)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 7, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology, y = 6.2, label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICR_rainfall_avg by CA_typology (per environment)",
    subtitle = "Tukey HSD: significant differences by letter",
    x = "CA Typology",
    y = "Rainfall (avg, mm)"
  ) +
  theme_minimal()

## CA Performance as affected by Comp_amount_corr_quali

NOtes: It seems to have a significant relation with both Y_relative and typology. Wether pooled all together of per environment, but when split by environment, all show the sam esign of trend, but is clear and significant only for mid-dry and transition Conclusions: Keep for the drivers analysis ###Simple regression: Comp_amount_corr_quali vs avg_Y_ratio

library(ggplot2)

# Filter data
df_filt <- drivers_data %>%
  dplyr::filter(!is.na(Comp_amount_corr_quali),
                !is.na(avg_Y_ratio),
                Comp_amount_corr_quali <= 15)

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = df_filt)

# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "Comp_amount_corr_quali vs. avg_Y_ratio",
    x = "Compost amount (corrected, ≤15)",
    y = "Average Yield Ratio"
  ) +
  annotate("text",
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

###Linear regression per environment (facet + R² + p, compost ≤ 15)

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(Comp_amount_corr_quali),
                !is.na(avg_Y_ratio),
                !is.na(IQF_environment),
                Comp_amount_corr_quali <= 15)

#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "Comp_amount_corr_quali vs. avg_Y_ratio",
    subtitle = "Linear regressions per environment (compost ≤ 15)",
    x = "Compost amount (corrected)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

###Boxplot by CA_typology (Tukey HSD, compost ≤ 15)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(CA_typology),
                !is.na(Comp_amount_corr_quali),
                !is.na(IQF_environment),
                Comp_amount_corr_quali <= 15)

get_letters <- function(df) {
  mod <- aov(Comp_amount_corr_quali ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(Comp_amount_corr_quali, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
ggplot(df_filt, aes(x = CA_typology, y = Comp_amount_corr_quali)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95,
        label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.85,
        label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "Comp_amount_corr_quali by CA_typology (per environment)",
    subtitle = "Tukey HSD (compost ≤ 15)",
    x = "CA Typology",
    y = "Compost amount (corrected)"
  ) +
  theme_minimal()

## CA Performance as affected by ICF_planting_date

NOtes: Conclusions: ###Simple regression: ICR_rainfall_avg ICF_planting_date

library(ggplot2)
library(dplyr)

# Filter data
df_filt <- drivers_data %>%
  dplyr::filter(!is.na(ICF_planting_date),
                !is.na(avg_Y_ratio))

# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_planting_date, data = df_filt)

# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared

# 3. Create plot
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  labs(
    title = "ICF_planting_date vs. avg_Y_ratio",
    x = "Planting date",
    y = "Average Yield Ratio"
  ) +
  annotate("text",
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = paste0("p = ", signif(p_val, 3),
                          "\nR^2 = ", round(r2, 2)),
           size = 5) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

###Linear regression per environment (facet + R^2 + p)

library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(ICF_planting_date),
                !is.na(avg_Y_ratio),
                !is.na(IQF_environment))

#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(avg_Y_ratio ~ ICF_planting_date, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 5,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date vs. avg_Y_ratio",
    subtitle = "Linear regressions per environment",
    x = "Planting date",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

###Boxplot by CA_typology (Tukey HSD)

library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)

df_filt <- drivers_data %>%
  dplyr::filter(!is.na(CA_typology),
                !is.na(ICF_planting_date),
                !is.na(IQF_environment))

get_letters <- function(df) {
  mod <- aov(ICF_planting_date ~ CA_typology, data = df)
  tuk <- TukeyHSD(mod)
  lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters

  data.frame(
    CA_typology = names(lets),
    Letters = lets,
    stringsAsFactors = FALSE
  )
}

stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    letters = map(data, get_letters),
    means = map(data, ~ .x %>%
                  group_by(CA_typology) %>%
                  summarise(mean_val = mean(ICF_planting_date, na.rm = TRUE),
                            .groups = "drop"))
  ) %>%
  unnest(letters) %>%
  left_join(
    unnest(dplyr::select(., IQF_environment, means)),
    by = c("IQF_environment", "CA_typology")
  ) %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 1)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
ggplot(df_filt, aes(x = CA_typology, y = ICF_planting_date)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +

  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95,
        label = Letters),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 3,
    color = "darkred"
  ) +
  geom_text(
    data = stats_df,
    aes(x = CA_typology,
        y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.85,
        label = mean_label),
    inherit.aes = FALSE,
    size = 3
  ) +

  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date by CA_typology (per environment)",
    subtitle = "Tukey HSD",
    x = "CA Typology",
    y = "Planting date"
  ) +
  theme_minimal()

###How does delay in planting relate to rain over the season?

library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)

# 1. Filter valid data
df_filt <- drivers_data %>%
  dplyr::filter(
    !is.na(ICF_planting_date),
    !is.na(ICR_rainfall_avg),
    !is.na(IQF_environment)
  )

# 2. Fit linear model per environment
stats_df <- df_filt %>%
  group_by(IQF_environment) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(ICR_rainfall_avg ~ ICF_planting_date, data = .x)),
    p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
    r_squared = map_dbl(model, ~ summary(.x)$r.squared),
    stat_label = paste0("p = ", signif(p_value, 3),
                        "\nR^2 = ", round(r_squared, 2))
  ) %>%
  dplyr::select(IQF_environment, stat_label)

# 3. Annotation position
label_y <- max(df_filt$ICR_rainfall_avg, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95

stats_df <- stats_df %>%
  mutate(x = label_x, y = label_y)

# 4. Plot
ggplot(df_filt, aes(x = ICF_planting_date, y = ICR_rainfall_avg)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
  geom_text(
    data = stats_df,
    aes(x = x, y = y, label = stat_label),
    inherit.aes = FALSE,
    size = 4,
    hjust = 1,
    vjust = 1
  ) +
  facet_wrap(~ IQF_environment) +
  labs(
    title = "ICF_planting_date vs. ICR_rainfall_avg",
    subtitle = "Linear regression per environment (p-value and R^2 shown)",
    x = "Planting date",
    y = "Rainfall average (mm)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

##Factorial variables ## CA Performance as affected by IQR_TF_tubura_client NOtes: Conclusions: ###IQR_TF_tubura_client with Y_ratio

library(ggplot2)
library(dplyr)

# 1. Filter valid data
df_filt <- drivers_data %>%
  dplyr::filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_TF_tubura_client),
    !IQR_TF_tubura_client %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_TF_tubura_client = droplevels(IQR_TF_tubura_client))  # drop unused levels

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Run Kruskal-Wallis test
kruskal_test <- kruskal.test(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
kruskal_p <- kruskal_test$p.value

# 4. Build annotation label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 5. Calculate group means
mean_df <- df_filt %>%
  group_by(IQR_TF_tubura_client) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Plot boxplot with p-values and means
ggplot(df_filt, aes(x = IQR_TF_tubura_client, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Means under each box
  geom_text(
    data = mean_df,
    aes(x = IQR_TF_tubura_client,
        y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
        label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # P-value annotation
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_TF_tubura_client",
    subtitle = "ANOVA and Kruskal–Wallis test results shown",
    x = "Tubura client status",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_TF_tubura_client with CA_typology

library(ggplot2)
library(dplyr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Clean and prepare data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQR_TF_tubura_client),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQR_TF_tubura_client %in% c("no data", "---", "", "NA")
  )

# Calculate N per category of explanatory variable
n_labels <- df_bar %>%
  group_by(IQR_TF_tubura_client) %>%
  summarise(n = n(), .groups = "drop")

# Plot with N annotations
ggplot(df_bar, aes(x = IQR_TF_tubura_client, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQR_TF_tubura_client, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Tubura client status",
    subtitle = "Proportions with total sample size per group",
    x = "IQR_TF_tubura_client",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# Build contingency table
tab <- table(df_bar$IQR_TF_tubura_client, df_bar$CA_typology)

# Run chi-squared test
chi_test <- chisq.test(tab)
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
# Print results
chi_test
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = NaN, df = 6, p-value = NA
# Fisher’s Exact Test (for categorical association)
fisher_test <- fisher.test(tab)

# Print results
fisher_test
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab
## p-value = 0.673
## alternative hypothesis: two.sided

CA Performance as affected by IQR_plot_fert_quality

NOtes: Conclusions: ###IQR_plot_fert_quality with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_plot_fert_quality),
    !IQR_plot_fert_quality %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_plot_fert_quality = factor(IQR_plot_fert_quality, levels = c("low_quality", "medium", "high")))


# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Tukey HSD and Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_plot_fert_quality`[, "p adj"])$Letters

letters_df <- data.frame(
  IQR_plot_fert_quality = names(letters),
  Letters = letters,
  stringsAsFactors = FALSE
)

# 4. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)$p.value
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means and letters
labels_df <- left_join(means_df, letters_df, by = "IQR_plot_fert_quality")

# 7. Plot
ggplot(df_filt, aes(x = IQR_plot_fert_quality, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Add Tukey letters above boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_plot_fert_quality, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Add means below boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_plot_fert_quality, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # Add test label
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_plot_fert_quality",
    subtitle = "Tukey HSD letters, means, and statistical tests",
    x = "Plot Fertility Quality",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_plot_fert_quality with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean and prepare data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQR_plot_fert_quality),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQR_plot_fert_quality %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(
    IQR_plot_fert_quality = factor(IQR_plot_fert_quality,
                                   levels = c("high", "medium", "low_quality"))  # Desired order
  )

# 2. Calculate N per category
n_labels <- df_bar %>%
  group_by(IQR_plot_fert_quality) %>%
  summarise(n = n(), .groups = "drop")

# 3. Plot
ggplot(df_bar, aes(x = IQR_plot_fert_quality, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQR_plot_fert_quality, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Fertilizer Quality",
    subtitle = "Proportions with total sample size per group",
    x = "IQR_plot_fert_quality",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Chi-squared test
tab <- table(df_bar$IQR_plot_fert_quality, df_bar$CA_typology)
chi_test <- chisq.test(tab)
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
chi_test
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 3.8727, df = 6, p-value = 0.6939
# 5. Optional: Fisher’s Exact Test
fisher_test <- fisher.test(tab)
fisher_test
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab
## p-value = 0.5699
## alternative hypothesis: two.sided

CA Performance as affected by IQR_soil_texture

NOtes: Conclusions: ###IQR_soil_texture with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean the data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_soil_texture),
    !IQR_soil_texture %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_soil_texture = droplevels(IQR_soil_texture))

# Optional: Reorder texture levels manually (if desired)
# Example order: "sand", "loam", "clay", etc.
df_filt <- df_filt %>%
  mutate(IQR_soil_texture = factor(IQR_soil_texture, levels = c("Fine", "Mix", "Coarse")))

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Run Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)$p.value

# 4. Tukey HSD and compact letter display
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_texture`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_texture = names(letters), Letters = letters)

# 5. Compute group means
means_df <- df_filt %>%
  group_by(IQR_soil_texture) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_texture")

# 7. Test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_texture, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above box
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_texture, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below box
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_texture, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  # Statistical tests label
  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_soil_texture",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Soil Texture",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_soil_texture with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean and prepare data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQR_soil_texture),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQR_soil_texture %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQR_soil_texture = droplevels(as.factor(IQR_soil_texture))) %>%
  mutate(IQR_soil_texture = factor(IQR_soil_texture ,
                                   levels = c("Fine", "Mix", "Coarse")))
    # Desired order

# 2. Calculate N per level of explanatory variable
n_labels <- df_bar %>%
  group_by(IQR_soil_texture) %>%
  summarise(n = n(), .groups = "drop")

# 3. Bar plot
ggplot(df_bar, aes(x = IQR_soil_texture, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQR_soil_texture, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Soil Texture",
    subtitle = "Proportions with total sample size per texture class",
    x = "IQR_soil_texture",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Statistical tests
tab <- table(df_bar$IQR_soil_texture, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
chi_test
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 6.0014, df = 6, p-value = 0.423

CA Performance as affected by IQR_soil_color

NOtes: Conclusions: ###IQR_soil_color with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean the data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQR_soil_color),
    !IQR_soil_color %in% c("no data", "---", "", "#N/A")
  ) %>%
  mutate(IQR_soil_color = droplevels(IQR_soil_color))

# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_color, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal–Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_color, data = df_filt)$p.value

# 4. Tukey HSD and letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_color`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_color = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQR_soil_color) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine letters and means
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_color")

# 7. Test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_color, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_color, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below boxes
  geom_text(
    data = labels_df,
    aes(x = IQR_soil_color, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQR_soil_color",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Soil Color",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQR_soil_color with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQR_soil_color),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQR_soil_color %in% c("no data", "---", "", "#N/A")
  )

# 2. Calculate N per category
n_labels <- df_bar %>%
  group_by(IQR_soil_color) %>%
  summarise(n = n(), .groups = "drop")

# 3. Plot
ggplot(df_bar, aes(x = IQR_soil_color, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQR_soil_color, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Soil Color",
    subtitle = "Proportional bars with total sample size",
    x = "Soil Color",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Chi-squared and Fisher’s test
tab <- table(df_bar$IQR_soil_color, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = NaN, df = 15, p-value = NA
# Fisher’s Exact Test (with simulation to avoid memory issues)
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  1e+05 replicates)
## 
## data:  tab
## p-value = 0.651
## alternative hypothesis: two.sided

CA Performance as affected by IQF_position_slope

NOtes: Conclusions: ###IQF_position_slope with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQF_position_slope),
    !IQF_position_slope %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQF_position_slope = droplevels(IQF_position_slope))

# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_position_slope, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_position_slope, data = df_filt)$p.value

# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_position_slope`[, "p adj"])$Letters
letters_df <- data.frame(IQF_position_slope = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQF_position_slope) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQF_position_slope")

# 7. Statistical test label
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQF_position_slope, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Tukey HSD letters (above)
  geom_text(
    data = labels_df,
    aes(x = IQF_position_slope, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means (below)
  geom_text(
    data = labels_df,
    aes(x = IQF_position_slope, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQF_position_slope",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Slope Position",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQF_position_slope with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean and prepare data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQF_position_slope),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQF_position_slope %in% c("no data", "---", "", "NA")
  )

# 2. Calculate total N per slope category
n_labels <- df_bar %>%
  group_by(IQF_position_slope) %>%
  summarise(n = n(), .groups = "drop")

# 3. Plot proportion bar chart
ggplot(df_bar, aes(x = IQF_position_slope, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQF_position_slope, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Position on Slope",
    subtitle = "Proportions and total N per slope category",
    x = "Slope Position",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Run statistical tests
tab <- table(df_bar$IQF_position_slope, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = NaN, df = 18, p-value = NA
# Fisher’s Exact Test with simulation fallback
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  1e+05 replicates)
## 
## data:  tab
## p-value = 0.1499
## alternative hypothesis: two.sided

CA Performance as affected by Weed_species_A_combined

NOtes: Conclusions: ###Weed_species_A_combined with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(Weed_species_A_combined),
    !Weed_species_A_combined %in% c("no data", "-", "—", "NA")
  ) %>%
  mutate(Weed_species_A_combined = droplevels(Weed_species_A_combined))


# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal–Wallis
kruskal_p <- kruskal.test(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)$p.value

# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`Weed_species_A_combined`[, "p adj"])$Letters
letters_df <- data.frame(Weed_species_A_combined = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(Weed_species_A_combined) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "Weed_species_A_combined")

# 7. Combine test labels
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = Weed_species_A_combined, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above
  geom_text(
    data = labels_df,
    aes(x = Weed_species_A_combined, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below
  geom_text(
    data = labels_df,
    aes(x = Weed_species_A_combined, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by Weed_species_A_combined",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Weed Species (Combined)",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###Weed_species_A_combined with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Filter and clean data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(Weed_species_A_combined),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !Weed_species_A_combined %in% c("no data", "—", "", "NA")
  )

# 2. Total N per weed category
n_labels <- df_bar %>%
  group_by(Weed_species_A_combined) %>%
  summarise(n = n(), .groups = "drop")

# 3. Proportional bar plot
ggplot(df_bar, aes(x = Weed_species_A_combined, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = Weed_species_A_combined, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology distribution by Dominant Weed Species",
    subtitle = "Stacked bars show % CA_typology, with N on top",
    x = "Weed Species Group (A combined)",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Statistical tests
tab <- table(df_bar$Weed_species_A_combined, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = NaN, df = 12, p-value = NA
# Fisher’s Exact Test with simulated p-value (for large tables)
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  1e+05 replicates)
## 
## data:  tab
## p-value = 0.3736
## alternative hypothesis: two.sided

CA Performance as affected by IQF_environment

NOtes: Conclusions: ###IQF_environment with Y_ratio

library(ggplot2)
library(dplyr)
library(multcompView)

# 1. Filter and clean data
df_filt <- drivers_data %>%
  filter(
    !is.na(avg_Y_ratio),
    !is.na(IQF_environment),
    !IQF_environment %in% c("no data", "---", "", "NA")
  ) %>%
  mutate(IQF_environment = droplevels(factor(IQF_environment)))
df_filt <- df_filt %>%
  mutate(IQF_environment = factor(IQF_environment, levels = c("Highland", "Transition", "Mid_Low_Wet", "Mid_Low_Dry")))

# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_environment, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]

# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_environment, data = df_filt)$p.value

# 4. Tukey HSD + Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_environment`[, "p adj"])$Letters
letters_df <- data.frame(IQF_environment = names(letters), Letters = letters)

# 5. Group means
means_df <- df_filt %>%
  group_by(IQF_environment) %>%
  summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
  mutate(mean_label = paste0("mean = ", round(mean_val, 2)))

# 6. Combine letters + means
labels_df <- left_join(means_df, letters_df, by = "IQF_environment")

# 7. Test summary
test_label <- paste0(
  "ANOVA p = ", signif(anova_p, 3),
  "\nKruskal p = ", signif(kruskal_p, 3)
)

# 8. Plot
ggplot(df_filt, aes(x = IQF_environment, y = avg_Y_ratio)) +
  geom_boxplot(outlier.shape = NA, fill = "lightgray") +
  geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +

  # Letters above
  geom_text(
    data = labels_df,
    aes(x = IQF_environment, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
    inherit.aes = FALSE,
    fontface = "bold", size = 5, color = "darkred"
  ) +

  # Means below
  geom_text(
    data = labels_df,
    aes(x = IQF_environment, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
    inherit.aes = FALSE,
    size = 4
  ) +

  annotate("text",
           x = Inf, y = Inf,
           hjust = 1.1, vjust = 1.5,
           label = test_label,
           size = 5) +

  labs(
    title = "avg_Y_ratio by IQF_environment",
    subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
    x = "Environment Type",
    y = "Average Yield Ratio"
  ) +
  theme_minimal()

###IQF_environment with CA_typology

library(ggplot2)
library(dplyr)
library(scales)

# 1. Clean data
df_bar <- drivers_data %>%
  filter(
    !is.na(CA_typology),
    !is.na(IQF_environment),
    !CA_typology %in% c("no data", "---", "", "NA"),
    !IQF_environment %in% c("no data", "---", "", "NA")
  )
df_bar <- df_bar %>%
  mutate(IQF_environment = factor(IQF_environment, levels = c("Highland", "Transition", "Mid_Low_Wet", "Mid_Low_Dry")))

# 2. Total N per environment
n_labels <- df_bar %>%
  group_by(IQF_environment) %>%
  summarise(n = n(), .groups = "drop")

# 3. Proportional bar plot
ggplot(df_bar, aes(x = IQF_environment, fill = CA_typology)) +
  geom_bar(position = "fill", color = "black") +
  geom_text(
    data = n_labels,
    aes(x = IQF_environment, y = 1.02, label = paste0("N = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "CA_typology Distribution by Environment",
    subtitle = "Proportional bars with total N per environment",
    x = "IQF Environment",
    y = "Proportion of CA Typology",
    fill = "CA Typology"
  ) +
  theme_minimal()

# 4. Statistical tests
tab <- table(df_bar$IQF_environment, df_bar$CA_typology)

# Chi-squared test
chi_test <- chisq.test(tab)
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 25.835, df = 9, p-value = 0.002174
# Fisher’s Exact Test with simulation
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  1e+05 replicates)
## 
## data:  tab
## p-value = 0.00753
## alternative hypothesis: two.sided