#Libraries and Data upload
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(DataExplorer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.3.3
ca_raw <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/raw/ca_raw.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 11921 Columns: 94
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (54): IQR_Season, Seas_block_treat, IQR_SeasAB, Sample code, IQF_agzone,...
## dbl (39): ICF_Lat, ICF_Long, ICF_Alt, ICR_OM_perc, ICR_landprep_pers_usd_ha,...
## num (1): DC_cost_usd_ha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(dplyr)
ca_raw <- ca_raw %>%
mutate(
IQF_environment = case_when(
IQF_environment %in% c("Mid Low Dry", "Mid_Low_Dry") ~ "Mid_Low_Dry",
IQF_environment %in% c("Mid Low Wet", "Mid_Low_Wet") ~ "Mid_Low_Wet",
TRUE ~ IQF_environment
)
)
Some numeric columns are read as characters. We’ll define two
cleaning functions: 1. convert_numeric_chars() - converts
any character column to numeric if all values are numeric. 2.
clean_numeric_columns() - replaces invalid strings
(e.g. “na”, “NA”) and converts to numeric if most values are numbers. ##
Numeric/factoruian check
convert_numeric_chars <- function(df) {
df %>%
mutate(across(where(is.character), ~ {
if (all(grepl("^[-]?[0-9]*\\.?[0-9]+$", .x[!is.na(.x)]))) {
as.numeric(.x)
} else {
.x
}
}))
}
# Apply it
ca_raw <- convert_numeric_chars(ca_raw)
# Function to convert character columns to numeric when appropriate
clean_numeric_columns <- function(df) {
df <- df %>%
mutate(across(where(is.character), function(x) {
# Replace "NA", "na", "" with real NA
x[x %in% c("NA", "na", "")] <- NA
# Try converting to numeric
x_num <- suppressWarnings(as.numeric(x))
# If most non-NA values are numeric, keep it
if (sum(!is.na(x_num)) > 0.9 * sum(!is.na(x))) {
return(x_num)
} else {
return(x)
}
}))
return(df)
}
# Apply it to ca_raw
ca_raw <- clean_numeric_columns(ca_raw)
# Check summary of converted classes
table(sapply(ca_raw, class))
##
## character numeric
## 39 55
library(dplyr)
library(tidyr)
summary_farmers_raw <- ca_raw %>%
filter(!is.na(IQR_Season), !is.na(IQR_block)) %>%
group_by(IQR_Season) %>%
summarise(
# total farmers (no yield / treatment conditions)
total_farmers = n_distinct(IQR_block),
# farmers who have BOTH A and C with yield not NA and not 0
farmers_with_A_and_C = n_distinct(IQR_block[
Treat_code %in% c("A","C") &
!is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 &
ave(Treat_code %in% c("A","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="A",
IQR_block, FUN = any) &
ave(Treat_code %in% c("A","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="C",
IQR_block, FUN = any)
]),
# farmers who have A AND (B OR C) with yield not NA and not 0
farmers_with_A_and_BorC = n_distinct(IQR_block[
Treat_code %in% c("A","B","C") &
!is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 &
ave(Treat_code %in% c("A","B","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="A",
IQR_block, FUN = any) &
ave(Treat_code %in% c("A","B","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code %in% c("B","C"),
IQR_block, FUN = any)
]),
.groups = "drop"
) %>%
mutate(
farmers_with_A_and_C = replace_na(farmers_with_A_and_C, 0),
farmers_with_A_and_BorC = replace_na(farmers_with_A_and_BorC, 0),
pct_with_A_and_C = round(100 * farmers_with_A_and_C / total_farmers, 1),
pct_with_A_and_BorC = round(100 * farmers_with_A_and_BorC / total_farmers, 1)
)
summary_farmers_raw
## # A tibble: 6 × 6
## IQR_Season total_farmers farmers_with_A_and_C farmers_with_A_and_BorC
## <chr> <int> <int> <int>
## 1 23A 580 559 559
## 2 23B 579 506 507
## 3 24A 495 469 471
## 4 24B 477 472 472
## 5 25A 457 441 442
## 6 25B 449 441 442
## # ℹ 2 more variables: pct_with_A_and_C <dbl>, pct_with_A_and_BorC <dbl>
#Data cleaning ## Clean for only farm-season with A and C, and that they are not 0 or NA
library(dplyr)
ca_raw <- ca_raw %>%
mutate(
IQF_SeasonblockID = if_else(
is.na(IQF_SeasonblockID),
paste(IQR_block, IQR_Season, sep = "_"),
IQF_SeasonblockID
)
)
ca_TATC <- ca_raw %>%
# keep only A and C rows
filter(Treat_code %in% c("A", "C")) %>%
# group by the paired unit
group_by(IQF_SeasonblockID) %>%
# keep only blocks where BOTH A and C exist
filter(all(c("A", "C") %in% Treat_code)) %>%
# and BOTH have valid (non-NA, non-zero) yields
filter(all(!is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0)) %>%
ungroup()
ca_TATC
## # A tibble: 5,776 × 94
## IQR_Season Seas_block_treat IQR_SeasAB `Sample code` IQF_agzone IQF_region
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 23A 23A79_15C A 23A_i_P2_79_15_T Mayaga-Bu… S
## 2 23A 23A79_15A A 23A_i_P2_79_15_T Mayaga-Bu… S
## 3 23A 23A58_10A A 23A_i_P2_58_10_T Central P… NW
## 4 23A 23A58_10C A 23A_i_P2_58_10_T Central P… NW
## 5 23A 23A53_4C A 23A_i_P2_53_4_T Volcanic … NW
## 6 23A 23A53_4A A 23A_i_P2_53_4_T Volcanic … NW
## 7 23A 23A38_15A A 23A_i_P2_38_15_T Buberuka … NE
## 8 23A 23A38_15C A 23A_i_P2_38_15_T Buberuka … NE
## 9 23A 23A9_14A A 23A_i_P2_9_14_T Eastern R… E
## 10 23A 23A9_14C A 23A_i_P2_9_14_T Eastern R… E
## # ℹ 5,766 more rows
## # ℹ 88 more variables: IQF_District <chr>, IQF_environment <chr>,
## # IQR_TF_tubura_client <dbl>, ICR_age_hh_head <dbl>, IQR_sex_HH_head <chr>,
## # IQR_TF_role <chr>, ICR_HH_size <dbl>, ICR_arable_land_owned <dbl>,
## # IQR_block <chr>, Farm_ID <chr>, IQR_source_income <chr>,
## # IQR_TF_role_agrdecision <chr>, ICF_Lat <dbl>, ICF_Long <dbl>,
## # ICF_Alt <dbl>, IQR_plot_fert_quality <chr>, IQF_SeasonblockID <chr>, …
ca_TATC %>%
count(IQF_SeasonblockID)
## # A tibble: 2,888 × 2
## IQF_SeasonblockID n
## <chr> <int>
## 1 102_10_23A 2
## 2 102_10_23B 2
## 3 102_10_24A 2
## 4 102_10_24B 2
## 5 102_10_25A 2
## 6 102_10_25B 2
## 7 102_11_23A 2
## 8 102_11_23B 2
## 9 102_11_24A 2
## 10 102_11_24B 2
## # ℹ 2,878 more rows
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(broom.mixed)
## Warning: package 'broom.mixed' was built under R version 4.3.3
library(dplyr)
model_data <- ca_TATC %>%
filter(
!is.na(DC_Yield_t_ha),
DC_Yield_t_ha != 0,
!is.na(Treat_code),
!is.na(IQR_Season),
!is.na(IQF_environment),
!is.na(IQR_block)
) %>%
mutate(
DC_Yield_t_ha = as.numeric(DC_Yield_t_ha),
Treat_code = factor(Treat_code),
IQR_Season = factor(IQR_Season),
IQF_environment = factor(IQF_environment),
IQR_block = factor(IQR_block)
)
###Linear mixed model for outlier detection
model_resid <- lmer(
DC_Yield_t_ha ~ Treat_code * IQR_Season +
(1 | IQR_Season) +
(1 | IQF_environment) +
(1 | IQR_block),
data = model_data,
REML = TRUE
)
resid_data <- augment(model_resid, data = model_data) %>%
mutate(
std_resid = as.numeric(scale(.resid)),
is_outlier = abs(std_resid) > 3
)
outlier_counts <- resid_data %>%
filter(is_outlier) %>%
group_by(IQR_Season, Treat_code) %>%
summarise(
n_outliers = n(),
.groups = "drop"
)
print(outlier_counts)
## # A tibble: 6 × 3
## IQR_Season Treat_code n_outliers
## <fct> <fct> <int>
## 1 23A A 13
## 2 23A C 4
## 3 24A A 17
## 4 24A C 7
## 5 25A A 8
## 6 25A C 7
###Inspect extreme observations
resid_data %>%
filter(is_outlier) %>%
select(
IQR_Season,
Treat_code,
IQR_block,
IQF_environment,
DC_Yield_t_ha,
std_resid
) %>%
arrange(desc(abs(std_resid)))
## # A tibble: 56 × 6
## IQR_Season Treat_code IQR_block IQF_environment DC_Yield_t_ha std_resid
## <fct> <fct> <fct> <fct> <dbl> <dbl>
## 1 24A A 53_4 Highland 18.1 8.22
## 2 24A A 53_3 Highland 19.4 8.14
## 3 24A A 53_5 Highland 16.9 7.85
## 4 23A A 17_6 Mid_Low_Dry 16.2 7.27
## 5 24A C 53_5 Highland 14.9 7.19
## 6 24A C 53_4 Highland 15.4 7.00
## 7 24A C 53_3 Highland 15.7 6.07
## 8 23A A 17_1 Mid_Low_Dry 16.7 6.07
## 9 24A A 49_2 Highland 14.4 5.98
## 10 24A A 53_1 Highland 14.1 5.15
## # ℹ 46 more rows
library(dplyr)
library(ggplot2)
# Prepare base plotting data
plot_data <- ca_TATC %>%
filter(
!is.na(DC_Yield_t_ha),
DC_Yield_t_ha != 0,
!is.na(Treat_code),
!is.na(IQR_Season),
!is.na(IQF_agzone)
) %>%
mutate(
Treat_code = factor(Treat_code),
IQR_Season = factor(IQR_Season),
IQF_agzone = factor(IQF_agzone)
)
# Join LMM outlier flag
plot_data_lmm <- plot_data %>%
left_join(
resid_data %>%
select(
IQR_Season,
Treat_code,
IQR_block,
is_outlier_lmm = is_outlier
),
by = c("IQR_Season", "Treat_code", "IQR_block")
) %>%
mutate(
is_outlier_lmm = ifelse(is.na(is_outlier_lmm), FALSE, is_outlier_lmm)
)
ggplot(plot_data_lmm, aes(x = Treat_code, y = DC_Yield_t_ha)) +
geom_boxplot(outlier.shape = NA, fill = "gray90") +
geom_jitter(
aes(color = is_outlier_lmm),
width = 0.2,
alpha = 0.7
) +
scale_color_manual(
values = c("FALSE" = "black", "TRUE" = "red"),
labels = c("FALSE" = "Regular observation", "TRUE" = "Model outlier")
) +
facet_grid(IQF_agzone ~ IQR_Season, scales = "free_y") +
labs(
title = "Yield (t/ha) by Treatment and Season",
subtitle = "Red points indicate outliers detected by linear mixed-effects model (|std. resid| > 4)",
x = "Treatment Code",
y = "Yield (t/ha)",
color = ""
) +
theme_minimal() +
theme(
legend.position = "bottom",
strip.text = element_text(size = 9)
)
library(dplyr)
threshold <- 0.1
block_season_flags <- ca_TATC %>%
filter(!is.na(IQF_SeasonblockID), !is.na(Treat_code)) %>%
mutate(DC_Yield_t_ha = as.numeric(DC_Yield_t_ha)) %>%
group_by(IQF_SeasonblockID, IQR_Season) %>%
summarise(
# both A and C exist AND both are < 0.1
AC_both_lt0.1 =
any(Treat_code == "A" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold) &
any(Treat_code == "C" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold),
# A OR B is < 0.1 (either one, if present)
AorB_lt0.1 =
any(Treat_code == "A" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold) |
any(Treat_code == "C" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold),
.groups = "drop"
)
# ---- Overall counts (unique block-season combinations) ----
overall_counts <- block_season_flags %>%
summarise(
total_block_season = n(),
n_AC_lt0.1 = sum(AC_both_lt0.1),
n_AorB_lt0.1 = sum(AorB_lt0.1),
prop_AC_lt0.1 = round(mean(AC_both_lt0.1), 3),
prop_AorB_lt0.1 = round(mean(AorB_lt0.1), 3)
)
overall_counts
## # A tibble: 1 × 5
## total_block_season n_AC_lt0.1 n_AorB_lt0.1 prop_AC_lt0.1 prop_AorB_lt0.1
## <int> <int> <int> <dbl> <dbl>
## 1 2888 1 14 0 0.005
# ---- By season (optional) ----
by_season_counts <- block_season_flags %>%
group_by(IQR_Season) %>%
summarise(
total_block_season = n(),
n_AC_lt0.1 = sum(AC_both_lt0.1),
n_AorB_lt0.1 = sum(AorB_lt0.1),
prop_AC_lt0.1 = round(mean(AC_both_lt0.1), 3),
prop_AorB_lt0.1 = round(mean(AorB_lt0.1), 3),
.groups = "drop"
)
by_season_counts
## # A tibble: 6 × 6
## IQR_Season total_block_season n_AC_lt0.1 n_AorB_lt0.1 prop_AC_lt0.1
## <chr> <int> <int> <int> <dbl>
## 1 23A 559 0 0 0
## 2 23B 506 1 7 0.002
## 3 24A 469 0 1 0
## 4 24B 472 0 6 0
## 5 25A 441 0 0 0
## 6 25B 441 0 0 0
## # ℹ 1 more variable: prop_AorB_lt0.1 <dbl>
clean databse with no NA or <0.1 for yield and cleaned removing outliers (bothe treatments for theat season-farmer it at lease one is outlier)
library(dplyr)
# --- 1) Identify model outliers (if you already have resid_data, skip to step 2) ---
# NOTE: requires lme4, lmerTest, broom.mixed loaded and model fitted as you did
# resid_data should contain: IQR_Season, Treat_code, IQR_block, IQF_environment, .resid, is_outlier
# (from: resid_data <- augment(model_resid, data = model_data) %>% mutate(std_resid=..., is_outlier=...))
# --- 2) Drop ALL outlier blocks (balanced A/C by SeasonblockID) ---
# Rule: if either A or C is flagged as outlier within a given IQF_SeasonblockID,
# remove the entire IQF_SeasonblockID (both treatments), so pairs remain balanced.
outlier_ids <- resid_data %>%
filter(is_outlier) %>%
distinct(IQF_SeasonblockID)
## not outliers
ca_AC_clean <- ca_TATC %>%
# keep only A and C (optional, but usually desired for paired A/C analysis)
filter(Treat_code %in% c("A", "C")) %>%
# keep only complete A/C pairs with valid yields (recommended)
group_by(IQF_SeasonblockID) %>%
filter(all(c("A","C") %in% Treat_code)) %>%
filter(all(!is.na(DC_Yield_t_ha) & DC_Yield_t_ha >= 0.1)) %>%
ungroup() %>%
# drop any SeasonblockID that had an outlier in either treatment
anti_join(outlier_ids, by = "IQF_SeasonblockID")
# Quick checks
cat("Rows before:", nrow(ca_TATC), "\n")
## Rows before: 5776
cat("Rows after :", nrow(ca_AC_clean), "\n")
## Rows after : 5654
# Confirm balance: should be 2 rows (A and C) per SeasonblockID
balance_check <- ca_AC_clean %>%
count(IQF_SeasonblockID) %>%
count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
print(balance_check)
## # A tibble: 1 × 2
## n nn
## <int> <int>
## 1 2 2827
library(tidyverse)
ca_AC_clean <- ca_AC_clean %>%
mutate(
year = str_sub(IQR_Season, 1, 2),
crop_code = str_sub(IQR_Season, -1),
crop = case_when(
crop_code == "A" ~ "Maize",
crop_code == "B" ~ "Beans",
TRUE ~ NA_character_
)
)
ca_AC_clean$block_season = paste0(ca_AC_clean$IQR_block, "_", ca_AC_clean$IQR_Season)
#New database of column of yield ratio CONV_CA_Ratio
library(dplyr)
CONV_CA_Ratio <- ca_AC_clean %>%
# Convert profit column
mutate(
DC_profit_usd_ha = as.numeric(gsub(",", "", trimws(DC_profit_usd_ha)))
) %>%
# Keep only relevant treatments
filter(Treat_code %in% c("A", "C")) %>%
filter(!is.na(DC_Yield_t_ha), !is.na(DC_profit_usd_ha)) %>%
# Group by block-season
group_by(IQR_block, IQR_Season) %>%
# NEW: keep only groups with both A and C after filtering
filter(n_distinct(Treat_code) == 2) %>%
# Mutate safely
mutate(
yield_A = DC_Yield_t_ha[Treat_code == "A"],
yield_C = DC_Yield_t_ha[Treat_code == "C"],
profit_A = DC_profit_usd_ha[Treat_code == "A"],
profit_C = DC_profit_usd_ha[Treat_code == "C"],
Y_ratio = yield_C / yield_A,
Profit_ratio = profit_C / profit_A
) %>%
# Keep only one row per group (e.g., the C row)
filter(Treat_code == "C") %>%
ungroup() %>%
filter(is.finite(Y_ratio), is.finite(Profit_ratio))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `DC_profit_usd_ha = as.numeric(gsub(",", "",
## trimws(DC_profit_usd_ha)))`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(tidyr)
A_vs_C_prop <- ca_AC_clean %>%
filter(Treat_code %in% c("A", "C")) %>%
select(
IQR_Season,
IQF_environment,
IQF_SeasonblockID,
Treat_code,
DC_Yield_t_ha
) %>%
pivot_wider(
names_from = Treat_code,
values_from = DC_Yield_t_ha
) %>%
filter(!is.na(A), !is.na(C)) %>%
group_by(IQR_Season, IQF_environment) %>%
summarise(
n_blocks = n(),
prop_A_greater_C = mean(A > C),
prop_C_greater_A = mean(C > A),
prop_equal = mean(A == C),
.groups = "drop"
)
A_vs_C_prop
## # A tibble: 24 × 6
## IQR_Season IQF_environment n_blocks prop_A_greater_C prop_C_greater_A
## <chr> <chr> <int> <dbl> <dbl>
## 1 23A Highland 120 0.692 0.308
## 2 23A Mid_Low_Dry 173 0.728 0.266
## 3 23A Mid_Low_Wet 145 0.745 0.255
## 4 23A Transition 100 0.68 0.31
## 5 23B Highland 111 0.694 0.270
## 6 23B Mid_Low_Dry 160 0.906 0.0875
## 7 23B Mid_Low_Wet 132 0.773 0.212
## 8 23B Transition 90 0.756 0.233
## 9 24A Highland 104 0.971 0.0288
## 10 24A Mid_Low_Dry 146 0.753 0.233
## # ℹ 14 more rows
## # ℹ 1 more variable: prop_equal <dbl>
##Visual distribution of CONV advantage vs CA_Yield
library(ggplot2)
ggplot(CONV_CA_Ratio, aes(x = Y_ratio, fill = IQF_environment)) +
geom_density(alpha = 0.3) +
geom_vline(xintercept = 1, linetype = "dashed") +
labs(title = "Density of Relative Yield (CA / CONV) by Agzone")
##Visual distribution of CONV advantage vs CA_profit
ggplot(CONV_CA_Ratio, aes(x = Profit_ratio, fill = IQF_environment)) +
geom_density(alpha = 0.5, adjust = 1.2) + # <-- smoother
geom_vline(xintercept = 1, linetype = "dashed") +
scale_fill_brewer(palette = "Pastel1") +
coord_cartesian(xlim = c(-1, 6)) +
labs(
title = "Density of Relative Profit (CA / CONV) by Agzone",
x = "Profit_ratio",
y = "Density"
) +
theme_minimal()
## Another way to see the disctribution of the advantage of CONV_Yield
Overall about 22% of the block-season combinations had CA as a similar
or better performed than CONV
library(dplyr)
library(ggplot2)
# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
arrange(Y_ratio) %>%
filter(is.finite(Y_ratio)) %>%
mutate(
season_crop = case_when(
IQR_Season %in% c("23A", "24A", "25A") ~ "Maize season",
IQR_Season %in% c("23B", "24B", "25B") ~ "Beans season",
TRUE ~ "Other"
),
cum_percent = 100 * (row_number() / n())
)
# 2. Estimate intersection y-value at x = 1 (Y_ratio = 1) for each IQF_environment
intersection_lines <- plot_data %>%
group_by(IQF_environment) %>%
filter(n() >= 5) %>%
do({
tryCatch({
model <- loess(cum_percent ~ Y_ratio, data = .)
y_at_1 <- predict(model, newdata = data.frame(Y_ratio = 1))
data.frame(IQF_environment = unique(.$IQF_environment), y_cross = y_at_1)
}, error = function(e) {
data.frame(IQF_environment = unique(.$IQF_environment), y_cross = NA_real_)
})
}) %>%
ungroup()
# 3. Merge y_cross into plot_data
plot_data <- left_join(plot_data, intersection_lines, by = "IQF_environment")
# 4. Plot
ggplot(plot_data, aes(x = Y_ratio, y = cum_percent)) +
geom_point(aes(color = season_crop), size = 2) +
geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
geom_hline(aes(yintercept = y_cross), color = "blue", linetype = "dotted") +
geom_text(
data = intersection_lines,
aes(x = 1.01, y = y_cross, label = paste0(round(y_cross, 1), "%")),
inherit.aes = FALSE,
color = "blue",
size = 3,
hjust = 0
) +
scale_color_manual(
values = c("Maize season" = "gold", "Beans season" = "saddlebrown", "Other" = "gray70")
) +
scale_x_continuous(limits = c(0, 2.5)) +
scale_y_continuous(breaks = seq(0, 100, by = 10)) +
facet_wrap(~ IQF_environment, scales = "free_y") +
labs(
x = "Relative Yield (CA / CONV)",
y = "Cumulative % of Observations",
title = "Cumulative Frequency of Relative Yield at Block Level",
subtitle = "Dashed = 1 yield threshold; Dotted = intersect with cumulative curve",
color = "Season Crop Type"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5),
strip.text = element_text(face = "bold"),
legend.position = "right"
)
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
##Same distribution but for profit IN terms of profit, the advantage of
CONV is a little larger
library(dplyr)
library(ggplot2)
# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
arrange(Profit_ratio) %>%
filter(is.finite(Profit_ratio)) %>%
mutate(
season_crop = case_when(
IQR_Season %in% c("23A", "24A", "25A") ~ "Maize season",
IQR_Season %in% c("23B", "24B", "25B") ~ "Beans season",
TRUE ~ "Other"
),
cum_percent = 100 * (row_number() / n())
)
# 2. Estimate intersection y-value at x = 1 (Profit_ratio = 1) for each IQF_environment
intersection_lines <- plot_data %>%
group_by(IQF_environment) %>%
filter(n() >= 5) %>%
do({
tryCatch({
model <- loess(cum_percent ~ Profit_ratio, data = .)
y_at_1 <- predict(model, newdata = data.frame(Profit_ratio = 1))
data.frame(IQF_environment = unique(.$IQF_environment), y_cross = y_at_1)
}, error = function(e) {
data.frame(IQF_environment = unique(.$IQF_environment), y_cross = NA_real_)
})
}) %>%
ungroup()
# 3. Merge y_cross into plot_data
plot_data <- left_join(plot_data, intersection_lines, by = "IQF_environment")
# 4. Plot
ggplot(plot_data, aes(x = Profit_ratio, y = cum_percent)) +
geom_point(aes(color = season_crop), size = 2, alpha = 0.6) +
geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
geom_hline(aes(yintercept = y_cross), color = "blue", linetype = "dotted") +
geom_text(
data = intersection_lines,
aes(x = 1.01, y = y_cross, label = paste0(round(y_cross, 1), "%")),
inherit.aes = FALSE,
color = "blue",
size = 3,
hjust = 0
) +
scale_color_manual(
values = c("Maize season" = "gold", "Beans season" = "saddlebrown", "Other" = "gray70")
) +
scale_x_continuous(limits = c(-2, 2.5)) +
scale_y_continuous(breaks = seq(0, 100, by = 10)) +
facet_wrap(~ IQF_environment, scales = "free_y") +
labs(
x = "Relative Profit (CA / CONV)",
y = "Cumulative % of Observations",
title = "Cumulative Frequency of Relative Profit at Block Level",
subtitle = "Dashed = 1 profit threshold; Dotted = intersect with cumulative curve",
color = "Season Crop Type"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5),
strip.text = element_text(face = "bold"),
legend.position = "right"
)
## Warning: Removed 119 rows containing missing values or values outside the scale range
## (`geom_point()`).
## What is the relation among performance in Yield and Performance in
Profit separating by crop?
library(dplyr)
library(ggplot2)
# 1. Prepare filtered data
regression_data <- CONV_CA_Ratio %>%
filter(
is.finite(Y_ratio),
is.finite(Profit_ratio),
Profit_ratio > -5, Profit_ratio < 5,
IQR_SeasAB %in% c("A", "B")
) %>%
mutate(
crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
)
# 2. Create plot with regression line and 1:1 reference line
ggplot(regression_data, aes(x = Y_ratio, y = Profit_ratio)) +
geom_point(alpha = 0.4, color = "gray40", size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue", linewidth = 0.8) +
facet_wrap(~crop_type) +
labs(
title = "Relationship Between Yield Ratio and Profit Ratio by Crop",
x = "Relative Yield (Y_ratio: CA / CONV)",
y = "Relative Profit (Profit_ratio: CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
## `geom_smooth()` using formula = 'y ~ x'
#How does the performance of CA is affected by the environmental index (yield of the control?) ## First we construct an environment index This is the relative yield of each of the treatments in a block as compared to all the blocks of that treatment in the same season and environment. Theh we average the relative yield of all the treatments in a block for a season and we have the environmental indeal of it expresed as a ratio where if > 1 then the environment as expresed by the crops performance was good Notes: As a whole, the relation is weak, however, yield performance of CA (CA:CONV ratio) start to see more very bad scores for relatively poor environments (EI<1)
library(dplyr)
# Step 1: Make sure relevant columns exist
df <- ca_AC_clean %>%
filter(!is.na(DC_Yield_t_ha), !is.na(Treat_code), !is.na(block_season), !is.na(IQR_Season), !is.na(IQF_environment))
# Step 2: Compute average yield per Treat_code within IQR_Season + IQF_environment
ref_yields <- df %>%
group_by(Treat_code, IQR_Season, IQF_environment) %>%
summarise(avg_env_yield = mean(DC_Yield_t_ha, na.rm = TRUE), .groups = "drop")
# Step 3: Join reference yields back to main data
df_with_ref <- df %>%
left_join(ref_yields, by = c("Treat_code", "IQR_Season", "IQF_environment")) %>%
mutate(
yield_ratio_env = DC_Yield_t_ha / avg_env_yield
)
# Step 4: Compute average of those ratios per block-season
env_index_df <- df_with_ref %>%
group_by(block_season) %>%
summarise(env_index = mean(yield_ratio_env, na.rm = TRUE), .groups = "drop")
# Step 5: View result
head(env_index_df)
## # A tibble: 6 × 2
## block_season env_index
## <chr> <dbl>
## 1 102_10_23A 0.721
## 2 102_10_23B 0.816
## 3 102_10_24A 1.12
## 4 102_10_24B 0.862
## 5 102_10_25A 1.26
## 6 102_10_25B 1.18
# Then we paste it into CONV_CA_Ratio
# Merge environment index into CONV_CA_Ratio by block_season
CONV_CA_Ratio <- CONV_CA_Ratio %>%
left_join(env_index_df, by = "block_season")
# View result
head(CONV_CA_Ratio)
## # A tibble: 6 × 105
## IQR_Season Seas_block_treat IQR_SeasAB `Sample code` IQF_agzone IQF_region
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 23A 23A79_15C A 23A_i_P2_79_15_T Mayaga-Bug… S
## 2 23A 23A58_10C A 23A_i_P2_58_10_T Central Pl… NW
## 3 23A 23A53_4C A 23A_i_P2_53_4_T Volcanic c… NW
## 4 23A 23A38_15C A 23A_i_P2_38_15_T Buberuka h… NE
## 5 23A 23A9_14C A 23A_i_P2_9_14_T Eastern Ri… E
## 6 23A 23A32_3C A 23A_i_P2_32_3_T Eastern Sa… NE
## # ℹ 99 more variables: IQF_District <chr>, IQF_environment <chr>,
## # IQR_TF_tubura_client <dbl>, ICR_age_hh_head <dbl>, IQR_sex_HH_head <chr>,
## # IQR_TF_role <chr>, ICR_HH_size <dbl>, ICR_arable_land_owned <dbl>,
## # IQR_block <chr>, Farm_ID <chr>, IQR_source_income <chr>,
## # IQR_TF_role_agrdecision <chr>, ICF_Lat <dbl>, ICF_Long <dbl>,
## # ICF_Alt <dbl>, IQR_plot_fert_quality <chr>, IQF_SeasonblockID <chr>,
## # IQR_soil_texture <chr>, IQR_soil_color <chr>, …
library(dplyr)
library(ggplot2)
library(ggpmisc) # for equation and p-value annotation
## Loading required package: ggpp
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
##
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Registered S3 method overwritten by 'ggpmisc':
## method from
## as.character.polynomial polynom
# 1. Prepare data (only Yield Ratio, up to 2.5)
plot_data <- CONV_CA_Ratio %>%
filter(
crop %in% c("Maize", "Beans"),
is.finite(Y_ratio),
is.finite(env_index),
Y_ratio <= 2.5
)
# 2. Plot with regression + equation
ggplot(plot_data, aes(x = env_index, y = Y_ratio)) +
geom_point(alpha = 0.4, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "steelblue") +
stat_poly_eq(
aes(label = paste(..eq.label.., ..p.value.label.., sep = "~~~")),
formula = y ~ x,
parse = TRUE,
label.x = "left", label.y = "top",
size = 3.5
) +
facet_wrap(~ crop) +
scale_y_continuous(limits = c(0, 2.5)) +
labs(
title = "Relationship Between Environment Index and Yield Ratio (CA / CONV)",
x = "Environment Index",
y = "Yield Ratio (CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
## Warning: The dot-dot notation (`..eq.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(eq.label)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Segmented linear model
# Load required packages
library(dplyr)
library(ggplot2)
library(segmented)
## Warning: package 'segmented' was built under R version 4.3.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
##
## lmList
## The following object is masked from 'package:dplyr':
##
## collapse
library(glue)
## Warning: package 'glue' was built under R version 4.3.3
library(purrr)
# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
filter(
crop %in% c("Maize", "Beans"),
is.finite(Y_ratio),
is.finite(env_index),
Y_ratio <= 2.5
)
# 2. Fit piecewise regression per crop, extract slopes and p-values safely
segmented_results <- plot_data %>%
group_split(crop) %>%
map_df(function(df) {
crop_name <- unique(df$crop)
# Fit linear model
lm_fit <- lm(Y_ratio ~ env_index, data = df)
# Try segmented model
seg_fit <- tryCatch(
segmented(lm_fit, seg.Z = ~ env_index, psi = median(df$env_index)),
error = function(e) NULL
)
if (!is.null(seg_fit)) {
df$fit <- fitted(seg_fit)
df$breakpoint <- seg_fit$psi[2]
# Try to extract slopes
slopes_df <- tryCatch(slope(seg_fit)$env_index, error = function(e) NULL)
if (!is.null(slopes_df) && nrow(slopes_df) >= 2) {
slope1 <- slopes_df[1, "Est."]
slope2 <- slopes_df[2, "Est."]
# Check if "p" column exists before accessing
if ("p" %in% colnames(slopes_df)) {
pval1 <- slopes_df[1, "p"]
pval2 <- slopes_df[2, "p"]
df$slope_label <- glue("Slope1 = {round(slope1, 3)} (p = {format.pval(pval1, digits = 2)})\n",
"Slope2 = {round(slope2, 3)} (p = {format.pval(pval2, digits = 2)})")
} else {
df$slope_label <- glue("Slope1 = {round(slope1, 3)}\nSlope2 = {round(slope2, 3)}")
}
} else {
df$slope_label <- "Slopes not available"
}
} else {
df$fit <- NA
df$breakpoint <- NA
df$slope_label <- "Model failed"
}
df
})
# 3. Plot with segmented line and slope annotations
ggplot(segmented_results, aes(x = env_index, y = Y_ratio)) +
geom_point(alpha = 0.4, size = 2) +
geom_line(aes(y = fit), color = "darkred", linewidth = 1) +
geom_text(
data = segmented_results %>%
group_by(crop) %>%
slice(1), # only show one label per facet
aes(x = -Inf, y = Inf, label = slope_label),
hjust = -0.05, vjust = 1.1,
inherit.aes = FALSE,
size = 3.5,
fontface = "italic"
) +
facet_wrap(~ crop) +
scale_y_continuous(limits = c(0, 2.5)) +
labs(
title = "Piecewise Linear Relationship Between Env Index and Yield Ratio",
x = "Environment Index",
y = "Yield Ratio (CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
##Polinomial?
library(dplyr)
library(ggplot2)
library(broom)
library(glue)
# 1. Prepare data
plot_data <- CONV_CA_Ratio %>%
filter(
crop %in% c("Maize", "Beans"),
is.finite(Y_ratio),
is.finite(env_index),
Y_ratio <= 2.5
)
# 2. Compute R² for each model type and crop
r2_data <- plot_data %>%
group_by(crop) %>%
group_map(~ {
df <- .x
crop_label <- .y$crop
models <- list(
Linear = lm(Y_ratio ~ env_index, data = df),
Quadratic = lm(Y_ratio ~ poly(env_index, 2), data = df),
Cubic = lm(Y_ratio ~ poly(env_index, 3), data = df),
LOESS = loess(Y_ratio ~ env_index, data = df)
)
r2_vals <- sapply(models, function(m) {
s <- tryCatch(summary(m), error = function(e) NULL)
if (!is.null(s$r.squared)) s$r.squared else NA_real_
})
tibble(
crop = crop_label,
model = names(models),
r2 = as.numeric(r2_vals)
)
}) %>%
bind_rows() %>%
mutate(r2_label = glue("{model}: R² = {round(r2, 3)}"))
# 3. Create the base plot
p <- ggplot(plot_data, aes(x = env_index, y = Y_ratio)) +
geom_point(alpha = 0.4, size = 2) +
facet_wrap(~ crop) +
scale_y_continuous(limits = c(0, 2.5)) +
labs(
title = "Model Shape Comparison: Yield Ratio vs Environment Index",
x = "Environment Index",
y = "Yield Ratio (CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
# 4. Add all smooth lines
p <- p +
geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1, linetype = "solid") +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red", linewidth = 1, linetype = "dashed") +
geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "darkgreen", linewidth = 1, linetype = "dotdash") +
geom_smooth(method = "loess", se = FALSE, color = "purple", linewidth = 1, linetype = "twodash")
# 5. Position for R² text
r2_data <- r2_data %>%
group_by(crop) %>%
mutate(
y = 2.5 - 0.2 * (row_number() - 1),
x = min(plot_data$env_index, na.rm = TRUE) + 0.05
)
# 6. Add R² labels to plot
p + geom_text(
data = r2_data,
aes(x = x, y = y, label = r2_label),
inherit.aes = FALSE,
hjust = 0,
size = 3.5,
fontface = "italic"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
##Now with profit
library(dplyr)
library(ggplot2)
library(broom)
library(glue)
# 1. Prepare data (limit extreme profit ratios)
plot_data <- CONV_CA_Ratio %>%
filter(
crop %in% c("Maize", "Beans"),
is.finite(Profit_ratio),
is.finite(env_index),
Profit_ratio <= 5, # optional: limit outliers
Profit_ratio >= -5
)
# 2. Compute R² for each model type and crop
r2_data <- plot_data %>%
group_by(crop) %>%
group_map(~ {
df <- .x
crop_label <- .y$crop
models <- list(
Linear = lm(Profit_ratio ~ env_index, data = df),
Quadratic = lm(Profit_ratio ~ poly(env_index, 2), data = df),
Cubic = lm(Profit_ratio ~ poly(env_index, 3), data = df),
LOESS = loess(Profit_ratio ~ env_index, data = df)
)
r2_vals <- sapply(models, function(m) {
s <- tryCatch(summary(m), error = function(e) NULL)
if (!is.null(s$r.squared)) s$r.squared else NA_real_
})
tibble(
crop = crop_label,
model = names(models),
r2 = as.numeric(r2_vals)
)
}) %>%
bind_rows() %>%
mutate(r2_label = glue("{model}: R² = {round(r2, 3)}"))
# 3. Create base plot
p <- ggplot(plot_data, aes(x = env_index, y = Profit_ratio)) +
geom_point(alpha = 0.4, size = 2) +
facet_wrap(~ crop) +
scale_y_continuous(limits = c(-5, 5)) +
labs(
title = "Model Shape Comparison: Profit Ratio vs Environment Index",
x = "Environment Index",
y = "Profit Ratio (CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
# 4. Add all smooth trendlines
p <- p +
geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1, linetype = "solid") +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red", linewidth = 1, linetype = "dashed") +
geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "darkgreen", linewidth = 1, linetype = "dotdash") +
geom_smooth(method = "loess", se = FALSE, color = "purple", linewidth = 1, linetype = "twodash")
# 5. Position R² labels
r2_data <- r2_data %>%
group_by(crop) %>%
mutate(
y = 5 - 0.4 * (row_number() - 1),
x = min(plot_data$env_index, na.rm = TRUE) + 0.05
)
# 6. Add R² labels
p + geom_text(
data = r2_data,
aes(x = x, y = y, label = r2_label),
inherit.aes = FALSE,
hjust = 0,
size = 3.5,
fontface = "italic"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
NOtes: NO, it seems that there is actually not a general trend for yield imperovement (for each crops separately) along the 3 years. However, there is yes plots where there is a positive trend; more for beans than for maize. This may be related to the fact that initially the cover crop did lots of competition with the beans and was corrected a little each season to reduce it..???
library(dplyr)
library(ggplot2)
library(broom)
library(tidyr)
# -----------------------------
# 1. Prepare data
# -----------------------------
trend_data <- CONV_CA_Ratio %>%
filter(
is.finite(Y_ratio),
year %in% c("23", "24", "25"),
IQR_SeasAB %in% c("A", "B")
) %>%
mutate(
year = as.numeric(year),
crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
)
# -----------------------------
# 2. Function to compute slope counts
# -----------------------------
get_slope_label <- function(df) {
# One slope per block
slopes <- df %>%
group_by(IQR_block) %>%
filter(n_distinct(year) >= 2) %>%
summarise(slope = coef(lm(Y_ratio ~ year))[2], .groups = "drop") %>%
mutate(
direction = case_when(
slope > 0 ~ "↑ Positive",
slope < 0 ~ "↓ Negative",
TRUE ~ "= Flat"
)
) %>%
count(direction) %>%
complete(direction, fill = list(n = 0))
# Format label text
paste0(
"↑ Positive: ", slopes$n[slopes$direction == "↑ Positive"], "\n",
"↓ Negative: ", slopes$n[slopes$direction == "↓ Negative"], "\n",
"= Flat: ", slopes$n[slopes$direction == "= Flat"]
)
}
# Labels
label_maize <- get_slope_label(trend_data %>% filter(crop_type == "Maize"))
label_beans <- get_slope_label(trend_data %>% filter(crop_type == "Beans"))
# Common y-limit helper
y_max <- max(trend_data$Y_ratio, na.rm = TRUE)
# -----------------------------
# 3. MAIZE plot
# -----------------------------
p_maize <- ggplot(
trend_data %>% filter(crop_type == "Maize"),
aes(x = year, y = Y_ratio)
) +
geom_point(alpha = 0.35, size = 2) +
# Block-level linear trends
geom_smooth(
aes(group = IQR_block),
method = "lm",
se = FALSE,
color = "steelblue",
linewidth = 0.6
) +
# Global trend
geom_smooth(
method = "lm",
se = FALSE,
color = "red",
linewidth = 1.3
) +
# Annotation
annotate(
"label",
x = 23.05,
y = y_max * 0.95,
label = label_maize,
hjust = 0,
vjust = 1,
size = 4.5,
fontface = "bold",
fill = "white",
label.size = 0.4
) +
scale_x_continuous(breaks = c(23, 24, 25)) +
labs(
title = "Block-level and Overall Linear Trends of Y_ratio (Maize)",
x = "Year",
y = "Relative Yield (Y_ratio)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5)
)
## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_maize,
## : Ignoring unknown parameters: `label.size`
# -----------------------------
# 4. BEANS plot
# -----------------------------
p_beans <- ggplot(
trend_data %>% filter(crop_type == "Beans"),
aes(x = year, y = Y_ratio)
) +
geom_point(alpha = 0.35, size = 2) +
# Block-level linear trends
geom_smooth(
aes(group = IQR_block),
method = "lm",
se = FALSE,
color = "darkorange",
linewidth = 0.6
) +
# Global trend
geom_smooth(
method = "lm",
se = FALSE,
color = "red",
linewidth = 1.3
) +
# Annotation
annotate(
"label",
x = 23.05,
y = y_max * 0.95,
label = label_beans,
hjust = 0,
vjust = 1,
size = 4.5,
fontface = "bold",
fill = "white",
label.size = 0.4
) +
scale_x_continuous(breaks = c(23, 24, 25)) +
labs(
title = "Block-level and Overall Linear Trends of Y_ratio (Beans)",
x = "Year",
y = "Relative Yield (Y_ratio)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5)
)
## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_beans,
## : Ignoring unknown parameters: `label.size`
# -----------------------------
# 5. Print plots
# -----------------------------
print(p_maize)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
print(p_beans)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## For a farm, the trend for maize is same as for beans? NOte: There is
not a clear relation among the trent in yield for maize and beans (that
is the trend of yield along seasons is almost independent among crops
for the same farm)
library(dplyr)
library(ggplot2)
library(tidyr)
# STEP 1: Calculate slopes per crop per block
slope_data <- CONV_CA_Ratio %>%
filter(
is.finite(Y_ratio),
year %in% c("23", "24", "25"),
IQR_SeasAB %in% c("A", "B")
) %>%
mutate(
year = as.numeric(year),
crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
) %>%
group_by(IQR_block, crop_type) %>%
filter(n_distinct(year) >= 2) %>%
summarise(
slope = coef(lm(Y_ratio ~ year))[2],
.groups = "drop"
) %>%
pivot_wider(
names_from = crop_type,
values_from = slope
) %>%
filter(!is.na(Maize) & !is.na(Beans))
# STEP 2: Classify quadrants
slope_data <- slope_data %>%
mutate(
quadrant = case_when(
Maize > 0 & Beans > 0 ~ "Q1: ↑ Maize & ↑ Beans",
Maize < 0 & Beans > 0 ~ "Q2: ↓ Maize & ↑ Beans",
Maize < 0 & Beans < 0 ~ "Q3: ↓ Maize & ↓ Beans",
Maize > 0 & Beans < 0 ~ "Q4: ↑ Maize & ↓ Beans",
TRUE ~ "Other"
)
)
# STEP 3: Count blocks per quadrant
quad_counts <- slope_data %>%
count(quadrant) %>%
filter(quadrant != "Other")
# STEP 4: Coordinates for text labels per quadrant
label_positions <- tibble::tibble(
quadrant = c("Q1: ↑ Maize & ↑ Beans", "Q2: ↓ Maize & ↑ Beans",
"Q3: ↓ Maize & ↓ Beans", "Q4: ↑ Maize & ↓ Beans"),
x = c( max(slope_data$Maize, na.rm = TRUE) * 0.85,
min(slope_data$Maize, na.rm = TRUE) * 0.85,
min(slope_data$Maize, na.rm = TRUE) * 0.85,
max(slope_data$Maize, na.rm = TRUE) * 0.85),
y = c( max(slope_data$Beans, na.rm = TRUE) * 0.85,
max(slope_data$Beans, na.rm = TRUE) * 0.85,
min(slope_data$Beans, na.rm = TRUE) * 0.85,
min(slope_data$Beans, na.rm = TRUE) * 0.85)
) %>%
left_join(quad_counts, by = "quadrant") %>%
mutate(label = paste0(quadrant, "\nN = ", n))
# STEP 5: Final plot
ggplot(slope_data, aes(x = Maize, y = Beans)) +
geom_point(alpha = 0.7, size = 2.5, color = "purple") +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted") +
geom_text(data = label_positions,
aes(x = x, y = y, label = label),
size = 4.2, hjust = 0.5, vjust = 0.5, fontface = "bold") +
labs(
title = "Comparison of Yield Trends (Slopes) per Block",
subtitle = "Each point is one IQR_block — comparing Maize vs Beans trend slopes",
x = "Maize trend slope",
y = "Beans trend slope"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
)
Note: Effectes of the 3 factors are significant and also the interactions. So the CA vs. CONV comparison needs to be opened and evaluated by season-environment combination
fit = lmer(DC_Yield_t_ha~IQR_Season + IQF_environment + Treat_code +
IQR_Season:IQF_environment + IQF_environment:Treat_code + IQR_Season:Treat_code+
IQR_Season:IQF_environment:Treat_code+
(1|IQR_block),
data = ca_AC_clean)
summary(fit)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## DC_Yield_t_ha ~ IQR_Season + IQF_environment + Treat_code + IQR_Season:IQF_environment +
## IQF_environment:Treat_code + IQR_Season:Treat_code + IQR_Season:IQF_environment:Treat_code +
## (1 | IQR_block)
## Data: ca_AC_clean
##
## REML criterion at convergence: 18145.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1423 -0.5809 -0.0357 0.5168 3.8186
##
## Random effects:
## Groups Name Variance Std.Dev.
## IQR_block (Intercept) 0.7352 0.8574
## Residual 1.1814 1.0869
## Number of obs: 5654, groups: IQR_block, 567
##
## Fixed effects:
## Estimate Std. Error
## (Intercept) 5.07265 0.11767
## IQR_Season23B -3.89347 0.14310
## IQR_Season24A -0.57716 0.14608
## IQR_Season24B -3.43020 0.14315
## IQR_Season25A -1.31273 0.30002
## IQR_Season25B -3.65663 0.14737
## IQF_environmentMid_Low_Dry 0.50789 0.14817
## IQF_environmentMid_Low_Wet -0.57116 0.15895
## IQF_environmentTransition -0.33084 0.17251
## Treat_codeC -0.58904 0.13926
## IQR_Season23B:IQF_environmentMid_Low_Dry 0.31837 0.18685
## IQR_Season24A:IQF_environmentMid_Low_Dry 0.92759 0.19113
## IQR_Season24B:IQF_environmentMid_Low_Dry -0.41498 0.18889
## IQR_Season25A:IQF_environmentMid_Low_Dry -0.28488 0.32428
## IQR_Season25B:IQF_environmentMid_Low_Dry 0.15796 0.19279
## IQR_Season23B:IQF_environmentMid_Low_Wet 0.39614 0.19425
## IQR_Season24A:IQF_environmentMid_Low_Wet 0.28734 0.19991
## IQR_Season24B:IQF_environmentMid_Low_Wet -0.02848 0.19666
## IQR_Season25A:IQF_environmentMid_Low_Wet 1.18206 0.33027
## IQR_Season25B:IQF_environmentMid_Low_Wet 0.26227 0.20164
## IQR_Season23B:IQF_environmentTransition 0.75971 0.21271
## IQR_Season24A:IQF_environmentTransition 0.34742 0.21956
## IQR_Season24B:IQF_environmentTransition 0.67783 0.21652
## IQR_Season25A:IQF_environmentTransition 1.17576 0.33871
## IQR_Season25B:IQF_environmentTransition 0.75964 0.22011
## IQF_environmentMid_Low_Dry:Treat_codeC -0.12676 0.18153
## IQF_environmentMid_Low_Wet:Treat_codeC 0.02037 0.18881
## IQF_environmentTransition:Treat_codeC 0.10068 0.20647
## IQR_Season23B:Treat_codeC 0.37611 0.20078
## IQR_Season24A:Treat_codeC -1.23510 0.20469
## IQR_Season24B:Treat_codeC -0.06783 0.20058
## IQR_Season25A:Treat_codeC -0.15221 0.40874
## IQR_Season25B:Treat_codeC 0.29254 0.20575
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC -0.24892 0.26193
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 1.01731 0.26749
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 0.35035 0.26445
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC -0.44379 0.43993
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 0.04390 0.26910
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC -0.07430 0.27266
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 1.00901 0.28010
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 0.19780 0.27563
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC -0.16033 0.44962
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 0.06295 0.28196
## IQR_Season23B:IQF_environmentTransition:Treat_codeC -0.19715 0.29854
## IQR_Season24A:IQF_environmentTransition:Treat_codeC 0.75924 0.30740
## IQR_Season24B:IQF_environmentTransition:Treat_codeC 0.04022 0.30248
## IQR_Season25A:IQF_environmentTransition:Treat_codeC -0.27436 0.46167
## IQR_Season25B:IQF_environmentTransition:Treat_codeC -0.17918 0.30722
## df t value
## (Intercept) 3946.84230 43.108
## IQR_Season23B 5007.77869 -27.209
## IQR_Season24A 5014.15153 -3.951
## IQR_Season24B 5029.94067 -23.963
## IQR_Season25A 5095.20364 -4.375
## IQR_Season25B 5036.77759 -24.812
## IQF_environmentMid_Low_Dry 5265.71529 3.428
## IQF_environmentMid_Low_Wet 4244.04672 -3.593
## IQF_environmentTransition 4466.82592 -1.918
## Treat_codeC 4934.68676 -4.230
## IQR_Season23B:IQF_environmentMid_Low_Dry 5024.75069 1.704
## IQR_Season24A:IQF_environmentMid_Low_Dry 5028.40349 4.853
## IQR_Season24B:IQF_environmentMid_Low_Dry 5038.70327 -2.197
## IQR_Season25A:IQF_environmentMid_Low_Dry 5152.64948 -0.879
## IQR_Season25B:IQF_environmentMid_Low_Dry 5044.03679 0.819
## IQR_Season23B:IQF_environmentMid_Low_Wet 5004.70035 2.039
## IQR_Season24A:IQF_environmentMid_Low_Wet 5014.82054 1.437
## IQR_Season24B:IQF_environmentMid_Low_Wet 5023.55760 -0.145
## IQR_Season25A:IQF_environmentMid_Low_Wet 5151.71514 3.579
## IQR_Season25B:IQF_environmentMid_Low_Wet 5029.47221 1.301
## IQR_Season23B:IQF_environmentTransition 5024.22535 3.572
## IQR_Season24A:IQF_environmentTransition 5036.14781 1.582
## IQR_Season24B:IQF_environmentTransition 5048.34564 3.131
## IQR_Season25A:IQF_environmentTransition 5140.32873 3.471
## IQR_Season25B:IQF_environmentTransition 5049.16391 3.451
## IQF_environmentMid_Low_Dry:Treat_codeC 4934.73832 -0.698
## IQF_environmentMid_Low_Wet:Treat_codeC 4934.43099 0.108
## IQF_environmentTransition:Treat_codeC 4936.59168 0.488
## IQR_Season23B:Treat_codeC 4934.54897 1.873
## IQR_Season24A:Treat_codeC 4933.26541 -6.034
## IQR_Season24B:Treat_codeC 4934.27215 -0.338
## IQR_Season25A:Treat_codeC 4932.21882 -0.372
## IQR_Season25B:Treat_codeC 4933.17413 1.422
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC 4934.36696 -0.950
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 4933.62520 3.803
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 4933.92198 1.325
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC 4932.37895 -1.009
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 4933.18903 0.163
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC 4934.13254 -0.272
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 4933.57546 3.602
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 4934.53452 0.718
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC 4932.40823 -0.357
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 4933.03230 0.223
## IQR_Season23B:IQF_environmentTransition:Treat_codeC 4936.75531 -0.660
## IQR_Season24A:IQF_environmentTransition:Treat_codeC 4934.11571 2.470
## IQR_Season24B:IQF_environmentTransition:Treat_codeC 4935.72560 0.133
## IQR_Season25A:IQF_environmentTransition:Treat_codeC 4932.90235 -0.594
## IQR_Season25B:IQF_environmentTransition:Treat_codeC 4934.01727 -0.583
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## IQR_Season23B < 2e-16 ***
## IQR_Season24A 7.89e-05 ***
## IQR_Season24B < 2e-16 ***
## IQR_Season25A 1.24e-05 ***
## IQR_Season25B < 2e-16 ***
## IQF_environmentMid_Low_Dry 0.000613 ***
## IQF_environmentMid_Low_Wet 0.000330 ***
## IQF_environmentTransition 0.055199 .
## Treat_codeC 2.38e-05 ***
## IQR_Season23B:IQF_environmentMid_Low_Dry 0.088461 .
## IQR_Season24A:IQF_environmentMid_Low_Dry 1.25e-06 ***
## IQR_Season24B:IQF_environmentMid_Low_Dry 0.028071 *
## IQR_Season25A:IQF_environmentMid_Low_Dry 0.379705
## IQR_Season25B:IQF_environmentMid_Low_Dry 0.412614
## IQR_Season23B:IQF_environmentMid_Low_Wet 0.041471 *
## IQR_Season24A:IQF_environmentMid_Low_Wet 0.150693
## IQR_Season24B:IQF_environmentMid_Low_Wet 0.884848
## IQR_Season25A:IQF_environmentMid_Low_Wet 0.000348 ***
## IQR_Season25B:IQF_environmentMid_Low_Wet 0.193442
## IQR_Season23B:IQF_environmentTransition 0.000358 ***
## IQR_Season24A:IQF_environmentTransition 0.113631
## IQR_Season24B:IQF_environmentTransition 0.001754 **
## IQR_Season25A:IQF_environmentTransition 0.000522 ***
## IQR_Season25B:IQF_environmentTransition 0.000563 ***
## IQF_environmentMid_Low_Dry:Treat_codeC 0.485007
## IQF_environmentMid_Low_Wet:Treat_codeC 0.914091
## IQF_environmentTransition:Treat_codeC 0.625834
## IQR_Season23B:Treat_codeC 0.061097 .
## IQR_Season24A:Treat_codeC 1.72e-09 ***
## IQR_Season24B:Treat_codeC 0.735259
## IQR_Season25A:Treat_codeC 0.709621
## IQR_Season25B:Treat_codeC 0.155149
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC 0.341986
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 0.000145 ***
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 0.185282
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC 0.313132
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 0.870420
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC 0.785259
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 0.000319 ***
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 0.473017
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC 0.721415
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 0.823344
## IQR_Season23B:IQF_environmentTransition:Treat_codeC 0.509028
## IQR_Season24A:IQF_environmentTransition:Treat_codeC 0.013548 *
## IQR_Season24B:IQF_environmentTransition:Treat_codeC 0.894235
## IQR_Season25A:IQF_environmentTransition:Treat_codeC 0.552351
## IQR_Season25B:IQF_environmentTransition:Treat_codeC 0.559773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 48 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
anova(fit)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value
## IQR_Season 10906.4 2181.29 5 5090.1 1846.3812
## IQF_environment 144.2 48.06 3 1584.7 40.6770
## Treat_code 456.2 456.21 1 4932.1 386.1684
## IQR_Season:IQF_environment 572.7 38.18 15 5134.1 32.3174
## IQF_environment:Treat_code 10.0 3.32 3 4935.8 2.8144
## IQR_Season:Treat_code 106.8 21.36 5 4932.0 18.0778
## IQR_Season:IQF_environment:Treat_code 42.0 2.80 15 4933.6 2.3681
## Pr(>F)
## IQR_Season < 2.2e-16 ***
## IQF_environment < 2.2e-16 ***
## Treat_code < 2.2e-16 ***
## IQR_Season:IQF_environment < 2.2e-16 ***
## IQF_environment:Treat_code 0.037797 *
## IQR_Season:Treat_code < 2.2e-16 ***
## IQR_Season:IQF_environment:Treat_code 0.002128 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load required libraries
library(dplyr)
library(ggplot2)
# 1. Filter and prepare main data
plot_data <- CONV_CA_Ratio %>%
filter(is.finite(Y_ratio), Y_ratio <= 5)
# 2. Compute p-values, significance, and means
sig_labels <- plot_data %>%
group_by(IQR_Season, IQF_environment, crop) %>%
summarise(
mean_y = mean(Y_ratio, na.rm = TRUE),
p_value = tryCatch(t.test(Y_ratio, mu = 1)$p.value, error = function(e) NA_real_),
.groups = "drop"
) %>%
mutate(
signif = case_when(
is.na(p_value) ~ "",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
),
mean_label = round(mean_y, 2) # Rounded mean
)
# 3. Get max Y for each group to place stars
y_pos <- plot_data %>%
group_by(IQR_Season, IQF_environment, crop) %>%
summarise(y_max = max(Y_ratio, na.rm = TRUE), .groups = "drop")
# 4. Merge label + position
sig_labels <- left_join(sig_labels, y_pos, by = c("IQR_Season", "IQF_environment", "crop"))
# 5. Plot
ggplot(plot_data, aes(x = IQR_Season, y = Y_ratio, color = crop)) +
geom_boxplot(outlier.alpha = 0.3) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray40") +
facet_wrap(~ IQF_environment) +
# Add significance stars
geom_text(
data = sig_labels,
aes(x = IQR_Season, y = y_max + 0.15, label = signif, group = crop),
inherit.aes = FALSE,
size = 5,
fontface = "bold"
) +
# Add mean values just above the stars
geom_text(
data = sig_labels,
aes(x = IQR_Season, y = y_max + 0.7, label = mean_label, group = crop),
inherit.aes = FALSE,
size = 3.5,
fontface = "plain",
color = "black"
) +
labs(
title = "Yield Ratio (CA / CONV) by Season and Environment",
x = "Season",
y = "Yield Ratio"
) +
ylim(0, 5.7) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
### Distribution of the residuals from mix model Although Shapiro–Wilk
tests indicated statistically significant deviations from normality (p
< 0.001), that my be because of the large sample size; W=0.99 the
residuals are close to normality and Q–Q plots showed no substantial
departures from normality, indicating that model assumptions were
adequately met.
library(broom.mixed)
library(dplyr)
resid_df <- augment(fit) %>%
mutate(
resid = .resid,
fitted = .fitted
)
library(ggplot2)
ggplot(resid_df, aes(x = resid)) +
geom_histogram(aes(y = after_stat(density)),
bins = 40,
fill = "grey80",
color = "black") +
geom_density(color = "blue", linewidth = 1) +
labs(
title = "Distribution of residuals",
x = "Residuals",
y = "Density"
) +
theme_minimal()
ggplot(resid_df, aes(sample = resid)) +
stat_qq(color = "black") +
stat_qq_line(color = "red", linewidth = 1) +
labs(
title = "Q–Q plot of residuals"
) +
theme_minimal()
ggplot(resid_df, aes(x = fitted, y = resid)) +
geom_point(alpha = 0.4) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Residuals vs fitted values",
x = "Fitted values",
y = "Residuals"
) +
theme_minimal()
### Test of residuals H0 rejected but because we have large N, the W is
actually very close to 1…so its quite normal
set.seed(123)
shapiro.test(sample(resid_df$resid, 5000))
##
## Shapiro-Wilk normality test
##
## data: sample(resid_df$resid, 5000)
## W = 0.98611, p-value < 2.2e-16
Mean is 0 and symetric distribution of the residual of the random effects
qqnorm(ranef(fit)$IQR_block[[1]])
qqline(ranef(fit)$IQR_block[[1]], col = "red")
To describe how farms perform under CA over time, we created a typology of block-level responses using their relative yield (CA / CONV) across six seasons (23A–25B). Only blocks with data in at least four seasons were retained to ensure stable classification.
For each block, we calculated its average relative yield, identified whether it ever experienced a yield penalty greater than 5%, and compiled its full sequence of seasonal performances. Based on these indicators, each farm was grouped into one of four long-term performance types: -Good: no season < 0.95 -Acceptable: Average ≥ 0.95 but at least one season < 0.95 -Poor: Average between 0.85 and 0.95 -Very Poor: Average ≤ 0.85
These thresholds were defined arbitrarily but intentionally, informed by field experience and our understanding of what yield reductions farmers may find acceptable if longer-term soil-health and economic benefits of CA are demonstrated and communicated clearly. Importantly, this typology is not final—it will be refined and validated through farmer consultations to ensure the categories and cut-off points align with farmers’ own perceptions of acceptable and unacceptable performance.
Notes: Average performance of CA is very bad, even if we target the environmetns or the districts with best performance, this blanket reccomednation would fail baddly. However,if we would be able to target the plots with good performance.. the rate of success of the reccomendatio could be high…for aprox 20% of the plots (i.e., 35% of the farmers…)
library(dplyr)
library(tidyr)
# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")
# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- CONV_CA_Ratio %>%
filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>%
group_by(IQR_block) %>%
summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
filter(n_valid_seasons >= 4)
# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- CONV_CA_Ratio %>%
filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
IQR_Season %in% seasons,
is.finite(Y_ratio))
# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
group_by(IQR_block, IQR_Season) %>%
summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(
names_from = IQR_Season,
values_from = Y_ratio,
names_prefix = "Y_ratio_"
)
# Step 5: Compute performance typology
Perf_Type <- filtered_yield %>%
group_by(IQR_block) %>%
summarise(
n_seasons = n_distinct(IQR_Season),
any_below_0.95 = any(Y_ratio < 0.95),
mean_ratio = mean(Y_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
CA_typology = case_when(
!any_below_0.95 ~ "Good",
any_below_0.95 & mean_ratio >= 0.95 ~ "Acceptable",
any_below_0.95 & mean_ratio < 0.95 & mean_ratio > 0.85 ~ "Poor",
mean_ratio <= 0.85 ~ "Very Poor"
)
)
# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- CONV_CA_Ratio %>%
filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>% # Prioritize 23A > 24A > 25A
distinct(IQR_block, .keep_all = TRUE) %>%
dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)
# Step 7: Join all into Perf_Type
Perf_Type <- Perf_Type %>%
left_join(RY_wide, by = "IQR_block") %>%
left_join(block_meta, by = "IQR_block")
Now we analyse the frequency of each type
# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
levels = c("Very Poor", "Poor", "Acceptable", "Good"))
ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
geom_bar(
aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
position = "stack"
) +
facet_wrap(~ IQF_environment) +
labs(
title = "CA-Performance Typology Distribution by OAF-AEZ",
x = "Performance Typology",
y = "Percentage of Performance Typologies"
) +
scale_fill_manual(values = c(
"Very Poor" = "#e41a1c",
"Poor" = "#ff7f00",
"Acceptable" = "#377eb8",
"Good" = "#4daf4a"
)) +
theme_minimal() +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
)
## What if we target the districts with the best performnace? wht could
be the rate of success? Notes: Yes better chance but it would still fail
in all
# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
levels = c("Very Poor", "Poor", "Acceptable", "Good"))
ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
geom_bar(
aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
position = "stack"
) +
facet_wrap(~ IQF_District) +
labs(
title = "CA-Performance Typology Distribution by District",
x = "Performance Typology",
y = "Percentage of Performance Typologies"
) +
scale_fill_manual(values = c(
"Very Poor" = "#e41a1c",
"Poor" = "#ff7f00",
"Acceptable" = "#377eb8",
"Good" = "#4daf4a"
)) +
theme_minimal() +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
)
## Another way to see it
library(dplyr)
library(ggplot2)
library(glue)
# 1. Mean Y_ratio per block (across seasons)
block_means <- CONV_CA_Ratio %>%
filter(is.finite(Y_ratio)) %>%
group_by(IQR_block, IQF_District) %>%
summarise(mean_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")
# 2. Summary for annotations
district_summary <- block_means %>%
group_by(IQF_District) %>%
summarise(
avg_ratio = round(mean(mean_Y_ratio, na.rm = TRUE), 2),
n_total = n(),
n_ge1 = sum(mean_Y_ratio >= 1, na.rm = TRUE),
percent_ge1 = round(100 * n_ge1 / n_total)
) %>%
mutate(
label = glue("≥1: {percent_ge1}%\nµ = {avg_ratio}"),
y_pos = 1.5
)
# 3. Plot
ggplot(block_means, aes(x = IQF_District, y = mean_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "gray90") +
geom_jitter(width = 0.2, alpha = 0.5, color = "steelblue", size = 2) +
geom_text(
data = district_summary,
aes(x = IQF_District, y = y_pos, label = label),
inherit.aes = FALSE,
size = 3.5,
vjust = 0
) +
labs(
title = "Distribution of Mean Y_ratio per Block by District",
x = "District",
y = "Mean Y_ratio per Block (across seasons)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold", hjust = 0.5)
)
# Drivers of yield performance We start with the database of yield
drivers that was already processed in the scrips: “data_mm_block” but we
change the relative
data_mm_block <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/data_mm_block.csv")
## Rows: 473 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): IQR_block, IQR_TF_tubura_client, IQR_plot_fert_quality, IQR_soil_te...
## dbl (8): ICR_Org_C_avg, ICR_K_perc_23A, ICR_Avail_P_avg, ICR_arable_land_avg...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drivers_data <- data_mm_block %>%
mutate(
CA_typology = fct_drop(CA_typology),
IQR_block = factor(IQR_block),
IQR_TF_tubura_client = factor(IQR_TF_tubura_client),
IQR_plot_fert_quality = factor(IQR_plot_fert_quality),
IQR_soil_texture = fct_na_value_to_level(factor(IQR_soil_texture), "Unknown"),
IQR_soil_color = fct_na_value_to_level(factor(IQR_soil_color), "Unknown"),
IQF_position_slope = fct_na_value_to_level(factor(IQF_position_slope), "Unknown"),
Weed_species_A_combined = fct_na_value_to_level(factor(Weed_species_A_combined), "Unknown")
)
str(drivers_data)
## tibble [473 × 17] (S3: tbl_df/tbl/data.frame)
## $ IQR_block : Factor w/ 473 levels "10_1","10_10",..: 15 16 17 18 19 20 21 22 23 24 ...
## $ ICR_Org_C_avg : num [1:473] 2.01 1.75 2.16 1.68 1.99 ...
## $ ICR_K_perc_23A : num [1:473] 0.75 0.32 0.57 0.41 0.42 0.55 1.75 1.39 2.09 1.25 ...
## $ ICR_Avail_P_avg : num [1:473] 11.7 11.5 22 60.6 35.6 ...
## $ ICR_arable_land_avg : num [1:473] 40.4 56.4 45.8 54 28.2 54.6 51.8 47.8 60.2 38.2 ...
## $ ICF_Alt_avg : num [1:473] 1636 1293 1141 1207 1145 ...
## $ ICR_rainfall_avg : num [1:473] 3.96 3.97 3.97 3.97 3.97 ...
## $ Comp_amount_corr_quali : num [1:473] 5.17 4.39 4.42 4.73 4.9 ...
## $ ICF_planting_date : num [1:473] 8.2 9.4 6.75 12.4 5.2 2.8 6.4 4.4 5.2 6.8 ...
## $ IQR_TF_tubura_client : Factor w/ 3 levels "---","0","1": 3 3 2 3 2 3 3 3 3 3 ...
## $ IQR_plot_fert_quality : Factor w/ 4 levels "---","high","low_quality",..: 4 2 4 2 4 4 4 2 4 2 ...
## $ IQR_soil_texture : Factor w/ 4 levels "Coarse","Fine",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ IQR_soil_color : Factor w/ 6 levels "#N/A","black",..: 2 2 2 5 2 5 2 2 5 2 ...
## $ IQF_position_slope : Factor w/ 7 levels "---","Backslope",..: 4 2 2 2 4 2 6 2 2 2 ...
## $ Weed_species_A_combined: Factor w/ 5 levels "—","Broadleaf",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ CA_typology : Factor w/ 3 levels "Acceptable","Poor",..: 3 1 3 2 1 3 1 1 2 2 ...
## $ IQF_environment : chr [1:473] "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" ...
# 1. Remove 'CA_typology' column from drivers_data
drivers_data <- drivers_data |> dplyr::select(-CA_typology)
# 2. Calculate average Y_ratio by IQR_block from CONV_CA_Ratio
# First, ensure grouping by both IQR_block and IQR_season, then average per IQR_block
avg_Y_ratio <- CONV_CA_Ratio |>
dplyr::group_by(IQR_block, IQR_Season) |>
dplyr::summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") |>
dplyr::group_by(IQR_block) |>
dplyr::summarise(avg_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")
# 3. Merge avg_Y_ratio into drivers_data by IQR_block
drivers_data <- drivers_data |>
dplyr::left_join(avg_Y_ratio, by = "IQR_block")