#Libraries and Data upload
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(DataExplorer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” forcats 1.0.0 âś” tibble 3.2.1
## âś” purrr 1.0.2 âś” tidyr 1.3.1
## âś” stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.3.3
ca_raw <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/raw/ca_raw.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 11921 Columns: 94
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (54): IQR_Season, Seas_block_treat, IQR_SeasAB, Sample code, IQF_agzone,...
## dbl (39): ICF_Lat, ICF_Long, ICF_Alt, ICR_OM_perc, ICR_landprep_pers_usd_ha,...
## num (1): DC_cost_usd_ha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(dplyr)
ca_raw <- ca_raw %>%
mutate(
IQF_environment = case_when(
IQF_environment %in% c("Mid Low Dry", "Mid_Low_Dry") ~ "Mid_Low_Dry",
IQF_environment %in% c("Mid Low Wet", "Mid_Low_Wet") ~ "Mid_Low_Wet",
TRUE ~ IQF_environment
)
)
Some numeric columns are read as characters. We’ll define two
cleaning functions: 1. convert_numeric_chars() - converts
any character column to numeric if all values are numeric. 2.
clean_numeric_columns() - replaces invalid strings
(e.g. “na”, “NA”) and converts to numeric if most values are numbers. ##
Numeric/factoruian check
convert_numeric_chars <- function(df) {
df %>%
mutate(across(where(is.character), ~ {
if (all(grepl("^[-]?[0-9]*\\.?[0-9]+$", .x[!is.na(.x)]))) {
as.numeric(.x)
} else {
.x
}
}))
}
# Apply it
ca_raw <- convert_numeric_chars(ca_raw)
# Function to convert character columns to numeric when appropriate
clean_numeric_columns <- function(df) {
df <- df %>%
mutate(across(where(is.character), function(x) {
# Replace "NA", "na", "" with real NA
x[x %in% c("NA", "na", "")] <- NA
# Try converting to numeric
x_num <- suppressWarnings(as.numeric(x))
# If most non-NA values are numeric, keep it
if (sum(!is.na(x_num)) > 0.9 * sum(!is.na(x))) {
return(x_num)
} else {
return(x)
}
}))
return(df)
}
# Apply it to ca_raw
ca_raw <- clean_numeric_columns(ca_raw)
# Check summary of converted classes
table(sapply(ca_raw, class))
##
## character numeric
## 39 55
library(dplyr)
library(tidyr)
summary_farmers_raw <- ca_raw %>%
filter(!is.na(IQR_Season), !is.na(IQR_block)) %>%
group_by(IQR_Season) %>%
summarise(
# total farmers (no yield / treatment conditions)
total_farmers = n_distinct(IQR_block),
# farmers who have BOTH A and C with yield not NA and not 0
farmers_with_A_and_C = n_distinct(IQR_block[
Treat_code %in% c("A","C") &
!is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 &
ave(Treat_code %in% c("A","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="A",
IQR_block, FUN = any) &
ave(Treat_code %in% c("A","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="C",
IQR_block, FUN = any)
]),
# farmers who have A AND (B OR C) with yield not NA and not 0
farmers_with_A_and_BorC = n_distinct(IQR_block[
Treat_code %in% c("A","B","C") &
!is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 &
ave(Treat_code %in% c("A","B","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code=="A",
IQR_block, FUN = any) &
ave(Treat_code %in% c("A","B","C") & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0 & Treat_code %in% c("B","C"),
IQR_block, FUN = any)
]),
.groups = "drop"
) %>%
mutate(
farmers_with_A_and_C = replace_na(farmers_with_A_and_C, 0),
farmers_with_A_and_BorC = replace_na(farmers_with_A_and_BorC, 0),
pct_with_A_and_C = round(100 * farmers_with_A_and_C / total_farmers, 1),
pct_with_A_and_BorC = round(100 * farmers_with_A_and_BorC / total_farmers, 1)
)
summary_farmers_raw
## # A tibble: 6 Ă— 6
## IQR_Season total_farmers farmers_with_A_and_C farmers_with_A_and_BorC
## <chr> <int> <int> <int>
## 1 23A 580 559 559
## 2 23B 579 506 507
## 3 24A 495 469 471
## 4 24B 477 472 472
## 5 25A 457 441 442
## 6 25B 449 441 442
## # ℹ 2 more variables: pct_with_A_and_C <dbl>, pct_with_A_and_BorC <dbl>
#Data cleaning ## Clean for only farm-season with A and C, and that they are not 0 or NA
library(dplyr)
ca_raw <- ca_raw %>%
mutate(
IQF_SeasonblockID = if_else(
is.na(IQF_SeasonblockID),
paste(IQR_block, IQR_Season, sep = "_"),
IQF_SeasonblockID
)
)
ca_TATC <- ca_raw %>%
# keep only A and C rows
filter(Treat_code %in% c("A", "C")) %>%
# group by the paired unit
group_by(IQF_SeasonblockID) %>%
# keep only blocks where BOTH A and C exist
filter(all(c("A", "C") %in% Treat_code)) %>%
# and BOTH have valid (non-NA, non-zero) yields
filter(all(!is.na(DC_Yield_t_ha) & DC_Yield_t_ha != 0)) %>%
ungroup()
ca_TATC
## # A tibble: 5,776 Ă— 94
## IQR_Season Seas_block_treat IQR_SeasAB `Sample code` IQF_agzone IQF_region
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 23A 23A79_15C A 23A_i_P2_79_15_T Mayaga-Bu… S
## 2 23A 23A79_15A A 23A_i_P2_79_15_T Mayaga-Bu… S
## 3 23A 23A58_10A A 23A_i_P2_58_10_T Central P… NW
## 4 23A 23A58_10C A 23A_i_P2_58_10_T Central P… NW
## 5 23A 23A53_4C A 23A_i_P2_53_4_T Volcanic … NW
## 6 23A 23A53_4A A 23A_i_P2_53_4_T Volcanic … NW
## 7 23A 23A38_15A A 23A_i_P2_38_15_T Buberuka … NE
## 8 23A 23A38_15C A 23A_i_P2_38_15_T Buberuka … NE
## 9 23A 23A9_14A A 23A_i_P2_9_14_T Eastern R… E
## 10 23A 23A9_14C A 23A_i_P2_9_14_T Eastern R… E
## # ℹ 5,766 more rows
## # ℹ 88 more variables: IQF_District <chr>, IQF_environment <chr>,
## # IQR_TF_tubura_client <dbl>, ICR_age_hh_head <dbl>, IQR_sex_HH_head <chr>,
## # IQR_TF_role <chr>, ICR_HH_size <dbl>, ICR_arable_land_owned <dbl>,
## # IQR_block <chr>, Farm_ID <chr>, IQR_source_income <chr>,
## # IQR_TF_role_agrdecision <chr>, ICF_Lat <dbl>, ICF_Long <dbl>,
## # ICF_Alt <dbl>, IQR_plot_fert_quality <chr>, IQF_SeasonblockID <chr>, …
ca_TATC %>%
count(IQF_SeasonblockID)
## # A tibble: 2,888 Ă— 2
## IQF_SeasonblockID n
## <chr> <int>
## 1 102_10_23A 2
## 2 102_10_23B 2
## 3 102_10_24A 2
## 4 102_10_24B 2
## 5 102_10_25A 2
## 6 102_10_25B 2
## 7 102_11_23A 2
## 8 102_11_23B 2
## 9 102_11_24A 2
## 10 102_11_24B 2
## # ℹ 2,878 more rows
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(broom.mixed)
## Warning: package 'broom.mixed' was built under R version 4.3.3
library(dplyr)
model_data <- ca_TATC %>%
filter(
!is.na(DC_Yield_t_ha),
DC_Yield_t_ha != 0,
!is.na(Treat_code),
!is.na(IQR_Season),
!is.na(IQF_environment),
!is.na(IQR_block)
) %>%
mutate(
DC_Yield_t_ha = as.numeric(DC_Yield_t_ha),
Treat_code = factor(Treat_code),
IQR_Season = factor(IQR_Season),
IQF_environment = factor(IQF_environment),
IQR_block = factor(IQR_block)
)
###Linear mixed model for outlier detection
model_resid <- lmer(
DC_Yield_t_ha ~ Treat_code * IQR_Season +
(1 | IQR_Season) +
(1 | IQF_environment) +
(1 | IQR_block),
data = model_data,
REML = TRUE
)
resid_data <- augment(model_resid, data = model_data) %>%
mutate(
std_resid = as.numeric(scale(.resid)),
is_outlier = abs(std_resid) > 3
)
outlier_counts <- resid_data %>%
filter(is_outlier) %>%
group_by(IQR_Season, Treat_code) %>%
summarise(
n_outliers = n(),
.groups = "drop"
)
print(outlier_counts)
## # A tibble: 6 Ă— 3
## IQR_Season Treat_code n_outliers
## <fct> <fct> <int>
## 1 23A A 13
## 2 23A C 4
## 3 24A A 17
## 4 24A C 7
## 5 25A A 8
## 6 25A C 7
###Inspect extreme observations
resid_data %>%
filter(is_outlier) %>%
select(
IQR_Season,
Treat_code,
IQR_block,
IQF_environment,
DC_Yield_t_ha,
std_resid
) %>%
arrange(desc(abs(std_resid)))
## # A tibble: 56 Ă— 6
## IQR_Season Treat_code IQR_block IQF_environment DC_Yield_t_ha std_resid
## <fct> <fct> <fct> <fct> <dbl> <dbl>
## 1 24A A 53_4 Highland 18.1 8.22
## 2 24A A 53_3 Highland 19.4 8.14
## 3 24A A 53_5 Highland 16.9 7.85
## 4 23A A 17_6 Mid_Low_Dry 16.2 7.27
## 5 24A C 53_5 Highland 14.9 7.19
## 6 24A C 53_4 Highland 15.4 7.00
## 7 24A C 53_3 Highland 15.7 6.07
## 8 23A A 17_1 Mid_Low_Dry 16.7 6.07
## 9 24A A 49_2 Highland 14.4 5.98
## 10 24A A 53_1 Highland 14.1 5.15
## # ℹ 46 more rows
library(dplyr)
library(ggplot2)
# Prepare base plotting data
plot_data <- ca_TATC %>%
filter(
!is.na(DC_Yield_t_ha),
DC_Yield_t_ha != 0,
!is.na(Treat_code),
!is.na(IQR_Season),
!is.na(IQF_agzone)
) %>%
mutate(
Treat_code = factor(Treat_code),
IQR_Season = factor(IQR_Season),
IQF_agzone = factor(IQF_agzone)
)
# Join LMM outlier flag
plot_data_lmm <- plot_data %>%
left_join(
resid_data %>%
select(
IQR_Season,
Treat_code,
IQR_block,
is_outlier_lmm = is_outlier
),
by = c("IQR_Season", "Treat_code", "IQR_block")
) %>%
mutate(
is_outlier_lmm = ifelse(is.na(is_outlier_lmm), FALSE, is_outlier_lmm)
)
ggplot(plot_data_lmm, aes(x = Treat_code, y = DC_Yield_t_ha)) +
geom_boxplot(outlier.shape = NA, fill = "gray90") +
geom_jitter(
aes(color = is_outlier_lmm),
width = 0.2,
alpha = 0.7
) +
scale_color_manual(
values = c("FALSE" = "black", "TRUE" = "red"),
labels = c("FALSE" = "Regular observation", "TRUE" = "Model outlier")
) +
facet_grid(IQF_agzone ~ IQR_Season, scales = "free_y") +
labs(
title = "Yield (t/ha) by Treatment and Season",
subtitle = "Red points indicate outliers detected by linear mixed-effects model (|std. resid| > 4)",
x = "Treatment Code",
y = "Yield (t/ha)",
color = ""
) +
theme_minimal() +
theme(
legend.position = "bottom",
strip.text = element_text(size = 9)
)
library(dplyr)
threshold <- 0.1
block_season_flags <- ca_TATC %>%
filter(!is.na(IQF_SeasonblockID), !is.na(Treat_code)) %>%
mutate(DC_Yield_t_ha = as.numeric(DC_Yield_t_ha)) %>%
group_by(IQF_SeasonblockID, IQR_Season) %>%
summarise(
# both A and C exist AND both are < 0.1
AC_both_lt0.1 =
any(Treat_code == "A" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold) &
any(Treat_code == "C" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold),
# A OR B is < 0.1 (either one, if present)
AorB_lt0.1 =
any(Treat_code == "A" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold) |
any(Treat_code == "C" & !is.na(DC_Yield_t_ha) & DC_Yield_t_ha < threshold),
.groups = "drop"
)
# ---- Overall counts (unique block-season combinations) ----
overall_counts <- block_season_flags %>%
summarise(
total_block_season = n(),
n_AC_lt0.1 = sum(AC_both_lt0.1),
n_AorB_lt0.1 = sum(AorB_lt0.1),
prop_AC_lt0.1 = round(mean(AC_both_lt0.1), 3),
prop_AorB_lt0.1 = round(mean(AorB_lt0.1), 3)
)
overall_counts
## # A tibble: 1 Ă— 5
## total_block_season n_AC_lt0.1 n_AorB_lt0.1 prop_AC_lt0.1 prop_AorB_lt0.1
## <int> <int> <int> <dbl> <dbl>
## 1 2888 1 14 0 0.005
# ---- By season (optional) ----
by_season_counts <- block_season_flags %>%
group_by(IQR_Season) %>%
summarise(
total_block_season = n(),
n_AC_lt0.1 = sum(AC_both_lt0.1),
n_AorB_lt0.1 = sum(AorB_lt0.1),
prop_AC_lt0.1 = round(mean(AC_both_lt0.1), 3),
prop_AorB_lt0.1 = round(mean(AorB_lt0.1), 3),
.groups = "drop"
)
by_season_counts
## # A tibble: 6 Ă— 6
## IQR_Season total_block_season n_AC_lt0.1 n_AorB_lt0.1 prop_AC_lt0.1
## <chr> <int> <int> <int> <dbl>
## 1 23A 559 0 0 0
## 2 23B 506 1 7 0.002
## 3 24A 469 0 1 0
## 4 24B 472 0 6 0
## 5 25A 441 0 0 0
## 6 25B 441 0 0 0
## # ℹ 1 more variable: prop_AorB_lt0.1 <dbl>
clean databse with no NA or <0.1 for yield and cleaned removing outliers (bothe treatments for theat season-farmer it at lease one is outlier)
library(dplyr)
# --- 1) Identify model outliers (if you already have resid_data, skip to step 2) ---
# NOTE: requires lme4, lmerTest, broom.mixed loaded and model fitted as you did
# resid_data should contain: IQR_Season, Treat_code, IQR_block, IQF_environment, .resid, is_outlier
# (from: resid_data <- augment(model_resid, data = model_data) %>% mutate(std_resid=..., is_outlier=...))
# --- 2) Drop ALL outlier blocks (balanced A/C by SeasonblockID) ---
# Rule: if either A or C is flagged as outlier within a given IQF_SeasonblockID,
# remove the entire IQF_SeasonblockID (both treatments), so pairs remain balanced.
outlier_ids <- resid_data %>%
filter(is_outlier) %>%
distinct(IQF_SeasonblockID)
## not outliers
ca_AC_clean <- ca_TATC %>%
# keep only A and C (optional, but usually desired for paired A/C analysis)
filter(Treat_code %in% c("A", "C")) %>%
# keep only complete A/C pairs with valid yields (recommended)
group_by(IQF_SeasonblockID) %>%
filter(all(c("A","C") %in% Treat_code)) %>%
filter(all(!is.na(DC_Yield_t_ha) & DC_Yield_t_ha >= 0.1)) %>%
ungroup() %>%
# drop any SeasonblockID that had an outlier in either treatment
anti_join(outlier_ids, by = "IQF_SeasonblockID")
# Quick checks
cat("Rows before:", nrow(ca_TATC), "\n")
## Rows before: 5776
cat("Rows after :", nrow(ca_AC_clean), "\n")
## Rows after : 5654
# Confirm balance: should be 2 rows (A and C) per SeasonblockID
balance_check <- ca_AC_clean %>%
count(IQF_SeasonblockID) %>%
count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
print(balance_check)
## # A tibble: 1 Ă— 2
## n nn
## <int> <int>
## 1 2 2827
library(tidyverse)
ca_AC_clean <- ca_AC_clean %>%
mutate(
year = str_sub(IQR_Season, 1, 2),
crop_code = str_sub(IQR_Season, -1),
crop = case_when(
crop_code == "A" ~ "Maize",
crop_code == "B" ~ "Beans",
TRUE ~ NA_character_
)
)
ca_AC_clean$block_season = paste0(ca_AC_clean$IQR_block, "_", ca_AC_clean$IQR_Season)
#New database of column of yield ratio CONV_CA_Ratio
library(dplyr)
CONV_CA_Ratio <- ca_AC_clean %>%
# Convert profit column
mutate(
DC_profit_usd_ha = as.numeric(gsub(",", "", trimws(DC_profit_usd_ha)))
) %>%
# Keep only relevant treatments
filter(Treat_code %in% c("A", "C")) %>%
filter(!is.na(DC_Yield_t_ha), !is.na(DC_profit_usd_ha)) %>%
# Group by block-season
group_by(IQR_block, IQR_Season) %>%
# NEW: keep only groups with both A and C after filtering
filter(n_distinct(Treat_code) == 2) %>%
# Mutate safely
mutate(
yield_A = DC_Yield_t_ha[Treat_code == "A"],
yield_C = DC_Yield_t_ha[Treat_code == "C"],
profit_A = DC_profit_usd_ha[Treat_code == "A"],
profit_C = DC_profit_usd_ha[Treat_code == "C"],
Y_ratio = yield_C / yield_A,
Profit_ratio = profit_C / profit_A
) %>%
# Keep only one row per group (e.g., the C row)
filter(Treat_code == "C") %>%
ungroup() %>%
filter(is.finite(Y_ratio), is.finite(Profit_ratio))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `DC_profit_usd_ha = as.numeric(gsub(",", "",
## trimws(DC_profit_usd_ha)))`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(tidyr)
A_vs_C_prop <- ca_AC_clean %>%
filter(Treat_code %in% c("A", "C")) %>%
select(
IQR_Season,
IQF_environment,
IQF_SeasonblockID,
Treat_code,
DC_Yield_t_ha
) %>%
pivot_wider(
names_from = Treat_code,
values_from = DC_Yield_t_ha
) %>%
filter(!is.na(A), !is.na(C)) %>%
group_by(IQR_Season, IQF_environment) %>%
summarise(
n_blocks = n(),
prop_A_greater_C = mean(A > C),
prop_C_greater_A = mean(C > A),
prop_equal = mean(A == C),
.groups = "drop"
)
A_vs_C_prop
## # A tibble: 24 Ă— 6
## IQR_Season IQF_environment n_blocks prop_A_greater_C prop_C_greater_A
## <chr> <chr> <int> <dbl> <dbl>
## 1 23A Highland 120 0.692 0.308
## 2 23A Mid_Low_Dry 173 0.728 0.266
## 3 23A Mid_Low_Wet 145 0.745 0.255
## 4 23A Transition 100 0.68 0.31
## 5 23B Highland 111 0.694 0.270
## 6 23B Mid_Low_Dry 160 0.906 0.0875
## 7 23B Mid_Low_Wet 132 0.773 0.212
## 8 23B Transition 90 0.756 0.233
## 9 24A Highland 104 0.971 0.0288
## 10 24A Mid_Low_Dry 146 0.753 0.233
## # ℹ 14 more rows
## # ℹ 1 more variable: prop_equal <dbl>
##Visual distribution of CONV advantage vs CA_Yield
library(ggplot2)
ggplot(CONV_CA_Ratio, aes(x = Y_ratio, fill = IQF_environment)) +
geom_density(alpha = 0.3) +
geom_vline(xintercept = 1, linetype = "dashed") +
labs(title = "Density of Relative Yield (CA / CONV) by Agzone")
##Visual distribution of CONV advantage vs CA_profit
ggplot(CONV_CA_Ratio, aes(x = Profit_ratio, fill = IQF_environment)) +
geom_density(alpha = 0.5, adjust = 1.2) + # <-- smoother
geom_vline(xintercept = 1, linetype = "dashed") +
scale_fill_brewer(palette = "Pastel1") +
coord_cartesian(xlim = c(-1, 6)) +
labs(
title = "Density of Relative Profit (CA / CONV) by Agzone",
x = "Profit_ratio",
y = "Density"
) +
theme_minimal()
## Another way to see the disctribution of the advantage of CONV_Yield
Overall about 22% of the block-season combinations had CA as a similar
or better performed than CONV
library(dplyr)
library(ggplot2)
# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
arrange(Y_ratio) %>%
filter(is.finite(Y_ratio)) %>%
mutate(
season_crop = case_when(
IQR_Season %in% c("23A", "24A", "25A") ~ "Maize season",
IQR_Season %in% c("23B", "24B", "25B") ~ "Beans season",
TRUE ~ "Other"
),
cum_percent = 100 * (row_number() / n())
)
# 2. Estimate intersection y-value at x = 1 (Y_ratio = 1) for each IQF_environment
intersection_lines <- plot_data %>%
group_by(IQF_environment) %>%
filter(n() >= 5) %>%
do({
tryCatch({
model <- loess(cum_percent ~ Y_ratio, data = .)
y_at_1 <- predict(model, newdata = data.frame(Y_ratio = 1))
data.frame(IQF_environment = unique(.$IQF_environment), y_cross = y_at_1)
}, error = function(e) {
data.frame(IQF_environment = unique(.$IQF_environment), y_cross = NA_real_)
})
}) %>%
ungroup()
# 3. Merge y_cross into plot_data
plot_data <- left_join(plot_data, intersection_lines, by = "IQF_environment")
# 4. Plot
ggplot(plot_data, aes(x = Y_ratio, y = cum_percent)) +
geom_point(aes(color = season_crop), size = 2) +
geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
geom_hline(aes(yintercept = y_cross), color = "blue", linetype = "dotted") +
geom_text(
data = intersection_lines,
aes(x = 1.01, y = y_cross, label = paste0(round(y_cross, 1), "%")),
inherit.aes = FALSE,
color = "blue",
size = 3,
hjust = 0
) +
scale_color_manual(
values = c("Maize season" = "gold", "Beans season" = "saddlebrown", "Other" = "gray70")
) +
scale_x_continuous(limits = c(0, 2.5)) +
scale_y_continuous(breaks = seq(0, 100, by = 10)) +
facet_wrap(~ IQF_environment, scales = "free_y") +
labs(
x = "Relative Yield (CA / CONV)",
y = "Cumulative % of Observations",
title = "Cumulative Frequency of Relative Yield at Block Level",
subtitle = "Dashed = 1 yield threshold; Dotted = intersect with cumulative curve",
color = "Season Crop Type"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5),
strip.text = element_text(face = "bold"),
legend.position = "right"
)
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
##Same distribution but for profit IN terms of profit, the advantage of
CONV is a little larger
library(dplyr)
library(ggplot2)
# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
arrange(Profit_ratio) %>%
filter(is.finite(Profit_ratio)) %>%
mutate(
season_crop = case_when(
IQR_Season %in% c("23A", "24A", "25A") ~ "Maize season",
IQR_Season %in% c("23B", "24B", "25B") ~ "Beans season",
TRUE ~ "Other"
),
cum_percent = 100 * (row_number() / n())
)
# 2. Estimate intersection y-value at x = 1 (Profit_ratio = 1) for each IQF_environment
intersection_lines <- plot_data %>%
group_by(IQF_environment) %>%
filter(n() >= 5) %>%
do({
tryCatch({
model <- loess(cum_percent ~ Profit_ratio, data = .)
y_at_1 <- predict(model, newdata = data.frame(Profit_ratio = 1))
data.frame(IQF_environment = unique(.$IQF_environment), y_cross = y_at_1)
}, error = function(e) {
data.frame(IQF_environment = unique(.$IQF_environment), y_cross = NA_real_)
})
}) %>%
ungroup()
# 3. Merge y_cross into plot_data
plot_data <- left_join(plot_data, intersection_lines, by = "IQF_environment")
# 4. Plot
ggplot(plot_data, aes(x = Profit_ratio, y = cum_percent)) +
geom_point(aes(color = season_crop), size = 2, alpha = 0.6) +
geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
geom_hline(aes(yintercept = y_cross), color = "blue", linetype = "dotted") +
geom_text(
data = intersection_lines,
aes(x = 1.01, y = y_cross, label = paste0(round(y_cross, 1), "%")),
inherit.aes = FALSE,
color = "blue",
size = 3,
hjust = 0
) +
scale_color_manual(
values = c("Maize season" = "gold", "Beans season" = "saddlebrown", "Other" = "gray70")
) +
scale_x_continuous(limits = c(-2, 2.5)) +
scale_y_continuous(breaks = seq(0, 100, by = 10)) +
facet_wrap(~ IQF_environment, scales = "free_y") +
labs(
x = "Relative Profit (CA / CONV)",
y = "Cumulative % of Observations",
title = "Cumulative Frequency of Relative Profit at Block Level",
subtitle = "Dashed = 1 profit threshold; Dotted = intersect with cumulative curve",
color = "Season Crop Type"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5),
strip.text = element_text(face = "bold"),
legend.position = "right"
)
## Warning: Removed 119 rows containing missing values or values outside the scale range
## (`geom_point()`).
## What is the relation among performance in Yield and Performance in
Profit separating by crop?
library(dplyr)
library(ggplot2)
# 1. Prepare filtered data
regression_data <- CONV_CA_Ratio %>%
filter(
is.finite(Y_ratio),
is.finite(Profit_ratio),
Profit_ratio > -5, Profit_ratio < 5,
IQR_SeasAB %in% c("A", "B")
) %>%
mutate(
crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
)
# 2. Create plot with regression line and 1:1 reference line
ggplot(regression_data, aes(x = Y_ratio, y = Profit_ratio)) +
geom_point(alpha = 0.4, color = "gray40", size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue", linewidth = 0.8) +
facet_wrap(~crop_type) +
labs(
title = "Relationship Between Yield Ratio and Profit Ratio by Crop",
x = "Relative Yield (Y_ratio: CA / CONV)",
y = "Relative Profit (Profit_ratio: CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
## `geom_smooth()` using formula = 'y ~ x'
#How does the performance of CA is affected by the environmental index (yield of the control?) ## First we construct an environment index This is the relative yield of each of the treatments in a block as compared to all the blocks of that treatment in the same season and environment. Theh we average the relative yield of all the treatments in a block for a season and we have the environmental indeal of it expresed as a ratio where if > 1 then the environment as expresed by the crops performance was good Notes: As a whole, the relation is weak, however, yield performance of CA (CA:CONV ratio) start to see more very bad scores for relatively poor environments (EI<1)
library(dplyr)
# Step 1: Make sure relevant columns exist
df <- ca_AC_clean %>%
filter(!is.na(DC_Yield_t_ha), !is.na(Treat_code), !is.na(block_season), !is.na(IQR_Season), !is.na(IQF_environment))
# Step 2: Compute average yield per Treat_code within IQR_Season + IQF_environment
ref_yields <- df %>%
group_by(Treat_code, IQR_Season, IQF_environment) %>%
summarise(avg_env_yield = mean(DC_Yield_t_ha, na.rm = TRUE), .groups = "drop")
# Step 3: Join reference yields back to main data
df_with_ref <- df %>%
left_join(ref_yields, by = c("Treat_code", "IQR_Season", "IQF_environment")) %>%
mutate(
yield_ratio_env = DC_Yield_t_ha / avg_env_yield
)
# Step 4: Compute average of those ratios per block-season
env_index_df <- df_with_ref %>%
group_by(block_season) %>%
summarise(env_index = mean(yield_ratio_env, na.rm = TRUE), .groups = "drop")
# Step 5: View result
head(env_index_df)
## # A tibble: 6 Ă— 2
## block_season env_index
## <chr> <dbl>
## 1 102_10_23A 0.721
## 2 102_10_23B 0.816
## 3 102_10_24A 1.12
## 4 102_10_24B 0.862
## 5 102_10_25A 1.26
## 6 102_10_25B 1.18
# Then we paste it into CONV_CA_Ratio
# Merge environment index into CONV_CA_Ratio by block_season
CONV_CA_Ratio <- CONV_CA_Ratio %>%
left_join(env_index_df, by = "block_season")
# View result
head(CONV_CA_Ratio)
## # A tibble: 6 Ă— 105
## IQR_Season Seas_block_treat IQR_SeasAB `Sample code` IQF_agzone IQF_region
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 23A 23A79_15C A 23A_i_P2_79_15_T Mayaga-Bug… S
## 2 23A 23A58_10C A 23A_i_P2_58_10_T Central Pl… NW
## 3 23A 23A53_4C A 23A_i_P2_53_4_T Volcanic c… NW
## 4 23A 23A38_15C A 23A_i_P2_38_15_T Buberuka h… NE
## 5 23A 23A9_14C A 23A_i_P2_9_14_T Eastern Ri… E
## 6 23A 23A32_3C A 23A_i_P2_32_3_T Eastern Sa… NE
## # ℹ 99 more variables: IQF_District <chr>, IQF_environment <chr>,
## # IQR_TF_tubura_client <dbl>, ICR_age_hh_head <dbl>, IQR_sex_HH_head <chr>,
## # IQR_TF_role <chr>, ICR_HH_size <dbl>, ICR_arable_land_owned <dbl>,
## # IQR_block <chr>, Farm_ID <chr>, IQR_source_income <chr>,
## # IQR_TF_role_agrdecision <chr>, ICF_Lat <dbl>, ICF_Long <dbl>,
## # ICF_Alt <dbl>, IQR_plot_fert_quality <chr>, IQF_SeasonblockID <chr>,
## # IQR_soil_texture <chr>, IQR_soil_color <chr>, …
library(dplyr)
library(ggplot2)
library(ggpmisc) # for equation and p-value annotation
## Loading required package: ggpp
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
##
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Registered S3 method overwritten by 'ggpmisc':
## method from
## as.character.polynomial polynom
# 1. Prepare data (only Yield Ratio, up to 2.5)
plot_data <- CONV_CA_Ratio %>%
filter(
crop %in% c("Maize", "Beans"),
is.finite(Y_ratio),
is.finite(env_index),
Y_ratio <= 2.5
)
# 2. Plot with regression + equation
ggplot(plot_data, aes(x = env_index, y = Y_ratio)) +
geom_point(alpha = 0.4, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "steelblue") +
stat_poly_eq(
aes(label = paste(..eq.label.., ..p.value.label.., sep = "~~~")),
formula = y ~ x,
parse = TRUE,
label.x = "left", label.y = "top",
size = 3.5
) +
facet_wrap(~ crop) +
scale_y_continuous(limits = c(0, 2.5)) +
labs(
title = "Relationship Between Environment Index and Yield Ratio (CA / CONV)",
x = "Environment Index",
y = "Yield Ratio (CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
## Warning: The dot-dot notation (`..eq.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(eq.label)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Segmented linear model
# Load required packages
library(dplyr)
library(ggplot2)
library(segmented)
## Warning: package 'segmented' was built under R version 4.3.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
##
## lmList
## The following object is masked from 'package:dplyr':
##
## collapse
library(glue)
## Warning: package 'glue' was built under R version 4.3.3
library(purrr)
# 1. Prepare the data
plot_data <- CONV_CA_Ratio %>%
filter(
crop %in% c("Maize", "Beans"),
is.finite(Y_ratio),
is.finite(env_index),
Y_ratio <= 2.5
)
# 2. Fit piecewise regression per crop, extract slopes and p-values safely
segmented_results <- plot_data %>%
group_split(crop) %>%
map_df(function(df) {
crop_name <- unique(df$crop)
# Fit linear model
lm_fit <- lm(Y_ratio ~ env_index, data = df)
# Try segmented model
seg_fit <- tryCatch(
segmented(lm_fit, seg.Z = ~ env_index, psi = median(df$env_index)),
error = function(e) NULL
)
if (!is.null(seg_fit)) {
df$fit <- fitted(seg_fit)
df$breakpoint <- seg_fit$psi[2]
# Try to extract slopes
slopes_df <- tryCatch(slope(seg_fit)$env_index, error = function(e) NULL)
if (!is.null(slopes_df) && nrow(slopes_df) >= 2) {
slope1 <- slopes_df[1, "Est."]
slope2 <- slopes_df[2, "Est."]
# Check if "p" column exists before accessing
if ("p" %in% colnames(slopes_df)) {
pval1 <- slopes_df[1, "p"]
pval2 <- slopes_df[2, "p"]
df$slope_label <- glue("Slope1 = {round(slope1, 3)} (p = {format.pval(pval1, digits = 2)})\n",
"Slope2 = {round(slope2, 3)} (p = {format.pval(pval2, digits = 2)})")
} else {
df$slope_label <- glue("Slope1 = {round(slope1, 3)}\nSlope2 = {round(slope2, 3)}")
}
} else {
df$slope_label <- "Slopes not available"
}
} else {
df$fit <- NA
df$breakpoint <- NA
df$slope_label <- "Model failed"
}
df
})
# 3. Plot with segmented line and slope annotations
ggplot(segmented_results, aes(x = env_index, y = Y_ratio)) +
geom_point(alpha = 0.4, size = 2) +
geom_line(aes(y = fit), color = "darkred", linewidth = 1) +
geom_text(
data = segmented_results %>%
group_by(crop) %>%
slice(1), # only show one label per facet
aes(x = -Inf, y = Inf, label = slope_label),
hjust = -0.05, vjust = 1.1,
inherit.aes = FALSE,
size = 3.5,
fontface = "italic"
) +
facet_wrap(~ crop) +
scale_y_continuous(limits = c(0, 2.5)) +
labs(
title = "Piecewise Linear Relationship Between Env Index and Yield Ratio",
x = "Environment Index",
y = "Yield Ratio (CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
##Polinomial?
library(dplyr)
library(ggplot2)
library(broom)
library(glue)
# 1. Prepare data
plot_data <- CONV_CA_Ratio %>%
filter(
crop %in% c("Maize", "Beans"),
is.finite(Y_ratio),
is.finite(env_index),
Y_ratio <= 2.5
)
# 2. Compute R² for each model type and crop
r2_data <- plot_data %>%
group_by(crop) %>%
group_map(~ {
df <- .x
crop_label <- .y$crop
models <- list(
Linear = lm(Y_ratio ~ env_index, data = df),
Quadratic = lm(Y_ratio ~ poly(env_index, 2), data = df),
Cubic = lm(Y_ratio ~ poly(env_index, 3), data = df),
LOESS = loess(Y_ratio ~ env_index, data = df)
)
r2_vals <- sapply(models, function(m) {
s <- tryCatch(summary(m), error = function(e) NULL)
if (!is.null(s$r.squared)) s$r.squared else NA_real_
})
tibble(
crop = crop_label,
model = names(models),
r2 = as.numeric(r2_vals)
)
}) %>%
bind_rows() %>%
mutate(r2_label = glue("{model}: R² = {round(r2, 3)}"))
# 3. Create the base plot
p <- ggplot(plot_data, aes(x = env_index, y = Y_ratio)) +
geom_point(alpha = 0.4, size = 2) +
facet_wrap(~ crop) +
scale_y_continuous(limits = c(0, 2.5)) +
labs(
title = "Model Shape Comparison: Yield Ratio vs Environment Index",
x = "Environment Index",
y = "Yield Ratio (CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
# 4. Add all smooth lines
p <- p +
geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1, linetype = "solid") +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red", linewidth = 1, linetype = "dashed") +
geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "darkgreen", linewidth = 1, linetype = "dotdash") +
geom_smooth(method = "loess", se = FALSE, color = "purple", linewidth = 1, linetype = "twodash")
# 5. Position for R² text
r2_data <- r2_data %>%
group_by(crop) %>%
mutate(
y = 2.5 - 0.2 * (row_number() - 1),
x = min(plot_data$env_index, na.rm = TRUE) + 0.05
)
# 6. Add R² labels to plot
p + geom_text(
data = r2_data,
aes(x = x, y = y, label = r2_label),
inherit.aes = FALSE,
hjust = 0,
size = 3.5,
fontface = "italic"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
##Now with profit
library(dplyr)
library(ggplot2)
library(broom)
library(glue)
# 1. Prepare data (limit extreme profit ratios)
plot_data <- CONV_CA_Ratio %>%
filter(
crop %in% c("Maize", "Beans"),
is.finite(Profit_ratio),
is.finite(env_index),
Profit_ratio <= 5, # optional: limit outliers
Profit_ratio >= -5
)
# 2. Compute R² for each model type and crop
r2_data <- plot_data %>%
group_by(crop) %>%
group_map(~ {
df <- .x
crop_label <- .y$crop
models <- list(
Linear = lm(Profit_ratio ~ env_index, data = df),
Quadratic = lm(Profit_ratio ~ poly(env_index, 2), data = df),
Cubic = lm(Profit_ratio ~ poly(env_index, 3), data = df),
LOESS = loess(Profit_ratio ~ env_index, data = df)
)
r2_vals <- sapply(models, function(m) {
s <- tryCatch(summary(m), error = function(e) NULL)
if (!is.null(s$r.squared)) s$r.squared else NA_real_
})
tibble(
crop = crop_label,
model = names(models),
r2 = as.numeric(r2_vals)
)
}) %>%
bind_rows() %>%
mutate(r2_label = glue("{model}: R² = {round(r2, 3)}"))
# 3. Create base plot
p <- ggplot(plot_data, aes(x = env_index, y = Profit_ratio)) +
geom_point(alpha = 0.4, size = 2) +
facet_wrap(~ crop) +
scale_y_continuous(limits = c(-5, 5)) +
labs(
title = "Model Shape Comparison: Profit Ratio vs Environment Index",
x = "Environment Index",
y = "Profit Ratio (CA / CONV)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
# 4. Add all smooth trendlines
p <- p +
geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 1, linetype = "solid") +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red", linewidth = 1, linetype = "dashed") +
geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "darkgreen", linewidth = 1, linetype = "dotdash") +
geom_smooth(method = "loess", se = FALSE, color = "purple", linewidth = 1, linetype = "twodash")
# 5. Position R² labels
r2_data <- r2_data %>%
group_by(crop) %>%
mutate(
y = 5 - 0.4 * (row_number() - 1),
x = min(plot_data$env_index, na.rm = TRUE) + 0.05
)
# 6. Add R² labels
p + geom_text(
data = r2_data,
aes(x = x, y = y, label = r2_label),
inherit.aes = FALSE,
hjust = 0,
size = 3.5,
fontface = "italic"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
NOtes: NO, it seems that there is actually not a general trend for yield imperovement (for each crops separately) along the 3 years. However, there is yes plots where there is a positive trend; more for beans than for maize. This may be related to the fact that initially the cover crop did lots of competition with the beans and was corrected a little each season to reduce it..???
library(dplyr)
library(ggplot2)
library(broom)
library(tidyr)
# -----------------------------
# 1. Prepare data
# -----------------------------
trend_data <- CONV_CA_Ratio %>%
filter(
is.finite(Y_ratio),
year %in% c("23", "24", "25"),
IQR_SeasAB %in% c("A", "B")
) %>%
mutate(
year = as.numeric(year),
crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
)
# -----------------------------
# 2. Function to compute slope counts
# -----------------------------
get_slope_label <- function(df) {
# One slope per block
slopes <- df %>%
group_by(IQR_block) %>%
filter(n_distinct(year) >= 2) %>%
summarise(slope = coef(lm(Y_ratio ~ year))[2], .groups = "drop") %>%
mutate(
direction = case_when(
slope > 0 ~ "↑ Positive",
slope < 0 ~ "↓ Negative",
TRUE ~ "= Flat"
)
) %>%
count(direction) %>%
complete(direction, fill = list(n = 0))
# Format label text
paste0(
"↑ Positive: ", slopes$n[slopes$direction == "↑ Positive"], "\n",
"↓ Negative: ", slopes$n[slopes$direction == "↓ Negative"], "\n",
"= Flat: ", slopes$n[slopes$direction == "= Flat"]
)
}
# Labels
label_maize <- get_slope_label(trend_data %>% filter(crop_type == "Maize"))
label_beans <- get_slope_label(trend_data %>% filter(crop_type == "Beans"))
# Common y-limit helper
y_max <- max(trend_data$Y_ratio, na.rm = TRUE)
# -----------------------------
# 3. MAIZE plot
# -----------------------------
p_maize <- ggplot(
trend_data %>% filter(crop_type == "Maize"),
aes(x = year, y = Y_ratio)
) +
geom_point(alpha = 0.35, size = 2) +
# Block-level linear trends
geom_smooth(
aes(group = IQR_block),
method = "lm",
se = FALSE,
color = "steelblue",
linewidth = 0.6
) +
# Global trend
geom_smooth(
method = "lm",
se = FALSE,
color = "red",
linewidth = 1.3
) +
# Annotation
annotate(
"label",
x = 23.05,
y = y_max * 0.95,
label = label_maize,
hjust = 0,
vjust = 1,
size = 4.5,
fontface = "bold",
fill = "white",
label.size = 0.4
) +
scale_x_continuous(breaks = c(23, 24, 25)) +
labs(
title = "Block-level and Overall Linear Trends of Y_ratio (Maize)",
x = "Year",
y = "Relative Yield (Y_ratio)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5)
)
## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_maize,
## : Ignoring unknown parameters: `label.size`
# -----------------------------
# 4. BEANS plot
# -----------------------------
p_beans <- ggplot(
trend_data %>% filter(crop_type == "Beans"),
aes(x = year, y = Y_ratio)
) +
geom_point(alpha = 0.35, size = 2) +
# Block-level linear trends
geom_smooth(
aes(group = IQR_block),
method = "lm",
se = FALSE,
color = "darkorange",
linewidth = 0.6
) +
# Global trend
geom_smooth(
method = "lm",
se = FALSE,
color = "red",
linewidth = 1.3
) +
# Annotation
annotate(
"label",
x = 23.05,
y = y_max * 0.95,
label = label_beans,
hjust = 0,
vjust = 1,
size = 4.5,
fontface = "bold",
fill = "white",
label.size = 0.4
) +
scale_x_continuous(breaks = c(23, 24, 25)) +
labs(
title = "Block-level and Overall Linear Trends of Y_ratio (Beans)",
x = "Year",
y = "Relative Yield (Y_ratio)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5)
)
## Warning in annotate("label", x = 23.05, y = y_max * 0.95, label = label_beans,
## : Ignoring unknown parameters: `label.size`
# -----------------------------
# 5. Print plots
# -----------------------------
print(p_maize)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
print(p_beans)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## For a farm, the trend for maize is same as for beans? NOte: There is
not a clear relation among the trent in yield for maize and beans (that
is the trend of yield along seasons is almost independent among crops
for the same farm)
library(dplyr)
library(ggplot2)
library(tidyr)
# STEP 1: Calculate slopes per crop per block
slope_data <- CONV_CA_Ratio %>%
filter(
is.finite(Y_ratio),
year %in% c("23", "24", "25"),
IQR_SeasAB %in% c("A", "B")
) %>%
mutate(
year = as.numeric(year),
crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
) %>%
group_by(IQR_block, crop_type) %>%
filter(n_distinct(year) >= 2) %>%
summarise(
slope = coef(lm(Y_ratio ~ year))[2],
.groups = "drop"
) %>%
pivot_wider(
names_from = crop_type,
values_from = slope
) %>%
filter(!is.na(Maize) & !is.na(Beans))
# STEP 2: Classify quadrants
slope_data <- slope_data %>%
mutate(
quadrant = case_when(
Maize > 0 & Beans > 0 ~ "Q1: ↑ Maize & ↑ Beans",
Maize < 0 & Beans > 0 ~ "Q2: ↓ Maize & ↑ Beans",
Maize < 0 & Beans < 0 ~ "Q3: ↓ Maize & ↓ Beans",
Maize > 0 & Beans < 0 ~ "Q4: ↑ Maize & ↓ Beans",
TRUE ~ "Other"
)
)
# STEP 3: Count blocks per quadrant
quad_counts <- slope_data %>%
count(quadrant) %>%
filter(quadrant != "Other")
# STEP 4: Coordinates for text labels per quadrant
label_positions <- tibble::tibble(
quadrant = c("Q1: ↑ Maize & ↑ Beans", "Q2: ↓ Maize & ↑ Beans",
"Q3: ↓ Maize & ↓ Beans", "Q4: ↑ Maize & ↓ Beans"),
x = c( max(slope_data$Maize, na.rm = TRUE) * 0.85,
min(slope_data$Maize, na.rm = TRUE) * 0.85,
min(slope_data$Maize, na.rm = TRUE) * 0.85,
max(slope_data$Maize, na.rm = TRUE) * 0.85),
y = c( max(slope_data$Beans, na.rm = TRUE) * 0.85,
max(slope_data$Beans, na.rm = TRUE) * 0.85,
min(slope_data$Beans, na.rm = TRUE) * 0.85,
min(slope_data$Beans, na.rm = TRUE) * 0.85)
) %>%
left_join(quad_counts, by = "quadrant") %>%
mutate(label = paste0(quadrant, "\nN = ", n))
# STEP 5: Final plot
ggplot(slope_data, aes(x = Maize, y = Beans)) +
geom_point(alpha = 0.7, size = 2.5, color = "purple") +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted") +
geom_text(data = label_positions,
aes(x = x, y = y, label = label),
size = 4.2, hjust = 0.5, vjust = 0.5, fontface = "bold") +
labs(
title = "Comparison of Yield Trends (Slopes) per Block",
subtitle = "Each point is one IQR_block — comparing Maize vs Beans trend slopes",
x = "Maize trend slope",
y = "Beans trend slope"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
)
Note: Effectes of the 3 factors are significant and also the interactions. So the CA vs. CONV comparison needs to be opened and evaluated by season-environment combination
fit = lmer(DC_Yield_t_ha~IQR_Season + IQF_environment + Treat_code +
IQR_Season:IQF_environment + IQF_environment:Treat_code + IQR_Season:Treat_code+
IQR_Season:IQF_environment:Treat_code+
(1|IQR_block),
data = ca_AC_clean)
summary(fit)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## DC_Yield_t_ha ~ IQR_Season + IQF_environment + Treat_code + IQR_Season:IQF_environment +
## IQF_environment:Treat_code + IQR_Season:Treat_code + IQR_Season:IQF_environment:Treat_code +
## (1 | IQR_block)
## Data: ca_AC_clean
##
## REML criterion at convergence: 18145.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1423 -0.5809 -0.0357 0.5168 3.8186
##
## Random effects:
## Groups Name Variance Std.Dev.
## IQR_block (Intercept) 0.7352 0.8574
## Residual 1.1814 1.0869
## Number of obs: 5654, groups: IQR_block, 567
##
## Fixed effects:
## Estimate Std. Error
## (Intercept) 5.07265 0.11767
## IQR_Season23B -3.89347 0.14310
## IQR_Season24A -0.57716 0.14608
## IQR_Season24B -3.43020 0.14315
## IQR_Season25A -1.31273 0.30002
## IQR_Season25B -3.65663 0.14737
## IQF_environmentMid_Low_Dry 0.50789 0.14817
## IQF_environmentMid_Low_Wet -0.57116 0.15895
## IQF_environmentTransition -0.33084 0.17251
## Treat_codeC -0.58904 0.13926
## IQR_Season23B:IQF_environmentMid_Low_Dry 0.31837 0.18685
## IQR_Season24A:IQF_environmentMid_Low_Dry 0.92759 0.19113
## IQR_Season24B:IQF_environmentMid_Low_Dry -0.41498 0.18889
## IQR_Season25A:IQF_environmentMid_Low_Dry -0.28488 0.32428
## IQR_Season25B:IQF_environmentMid_Low_Dry 0.15796 0.19279
## IQR_Season23B:IQF_environmentMid_Low_Wet 0.39614 0.19425
## IQR_Season24A:IQF_environmentMid_Low_Wet 0.28734 0.19991
## IQR_Season24B:IQF_environmentMid_Low_Wet -0.02848 0.19666
## IQR_Season25A:IQF_environmentMid_Low_Wet 1.18206 0.33027
## IQR_Season25B:IQF_environmentMid_Low_Wet 0.26227 0.20164
## IQR_Season23B:IQF_environmentTransition 0.75971 0.21271
## IQR_Season24A:IQF_environmentTransition 0.34742 0.21956
## IQR_Season24B:IQF_environmentTransition 0.67783 0.21652
## IQR_Season25A:IQF_environmentTransition 1.17576 0.33871
## IQR_Season25B:IQF_environmentTransition 0.75964 0.22011
## IQF_environmentMid_Low_Dry:Treat_codeC -0.12676 0.18153
## IQF_environmentMid_Low_Wet:Treat_codeC 0.02037 0.18881
## IQF_environmentTransition:Treat_codeC 0.10068 0.20647
## IQR_Season23B:Treat_codeC 0.37611 0.20078
## IQR_Season24A:Treat_codeC -1.23510 0.20469
## IQR_Season24B:Treat_codeC -0.06783 0.20058
## IQR_Season25A:Treat_codeC -0.15221 0.40874
## IQR_Season25B:Treat_codeC 0.29254 0.20575
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC -0.24892 0.26193
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 1.01731 0.26749
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 0.35035 0.26445
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC -0.44379 0.43993
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 0.04390 0.26910
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC -0.07430 0.27266
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 1.00901 0.28010
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 0.19780 0.27563
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC -0.16033 0.44962
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 0.06295 0.28196
## IQR_Season23B:IQF_environmentTransition:Treat_codeC -0.19715 0.29854
## IQR_Season24A:IQF_environmentTransition:Treat_codeC 0.75924 0.30740
## IQR_Season24B:IQF_environmentTransition:Treat_codeC 0.04022 0.30248
## IQR_Season25A:IQF_environmentTransition:Treat_codeC -0.27436 0.46167
## IQR_Season25B:IQF_environmentTransition:Treat_codeC -0.17918 0.30722
## df t value
## (Intercept) 3946.84230 43.108
## IQR_Season23B 5007.77869 -27.209
## IQR_Season24A 5014.15153 -3.951
## IQR_Season24B 5029.94067 -23.963
## IQR_Season25A 5095.20364 -4.375
## IQR_Season25B 5036.77759 -24.812
## IQF_environmentMid_Low_Dry 5265.71529 3.428
## IQF_environmentMid_Low_Wet 4244.04672 -3.593
## IQF_environmentTransition 4466.82592 -1.918
## Treat_codeC 4934.68676 -4.230
## IQR_Season23B:IQF_environmentMid_Low_Dry 5024.75069 1.704
## IQR_Season24A:IQF_environmentMid_Low_Dry 5028.40349 4.853
## IQR_Season24B:IQF_environmentMid_Low_Dry 5038.70327 -2.197
## IQR_Season25A:IQF_environmentMid_Low_Dry 5152.64948 -0.879
## IQR_Season25B:IQF_environmentMid_Low_Dry 5044.03679 0.819
## IQR_Season23B:IQF_environmentMid_Low_Wet 5004.70035 2.039
## IQR_Season24A:IQF_environmentMid_Low_Wet 5014.82054 1.437
## IQR_Season24B:IQF_environmentMid_Low_Wet 5023.55760 -0.145
## IQR_Season25A:IQF_environmentMid_Low_Wet 5151.71514 3.579
## IQR_Season25B:IQF_environmentMid_Low_Wet 5029.47221 1.301
## IQR_Season23B:IQF_environmentTransition 5024.22535 3.572
## IQR_Season24A:IQF_environmentTransition 5036.14781 1.582
## IQR_Season24B:IQF_environmentTransition 5048.34564 3.131
## IQR_Season25A:IQF_environmentTransition 5140.32873 3.471
## IQR_Season25B:IQF_environmentTransition 5049.16391 3.451
## IQF_environmentMid_Low_Dry:Treat_codeC 4934.73832 -0.698
## IQF_environmentMid_Low_Wet:Treat_codeC 4934.43099 0.108
## IQF_environmentTransition:Treat_codeC 4936.59168 0.488
## IQR_Season23B:Treat_codeC 4934.54897 1.873
## IQR_Season24A:Treat_codeC 4933.26541 -6.034
## IQR_Season24B:Treat_codeC 4934.27215 -0.338
## IQR_Season25A:Treat_codeC 4932.21882 -0.372
## IQR_Season25B:Treat_codeC 4933.17413 1.422
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC 4934.36696 -0.950
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 4933.62520 3.803
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 4933.92198 1.325
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC 4932.37895 -1.009
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 4933.18903 0.163
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC 4934.13254 -0.272
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 4933.57546 3.602
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 4934.53452 0.718
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC 4932.40823 -0.357
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 4933.03230 0.223
## IQR_Season23B:IQF_environmentTransition:Treat_codeC 4936.75531 -0.660
## IQR_Season24A:IQF_environmentTransition:Treat_codeC 4934.11571 2.470
## IQR_Season24B:IQF_environmentTransition:Treat_codeC 4935.72560 0.133
## IQR_Season25A:IQF_environmentTransition:Treat_codeC 4932.90235 -0.594
## IQR_Season25B:IQF_environmentTransition:Treat_codeC 4934.01727 -0.583
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## IQR_Season23B < 2e-16 ***
## IQR_Season24A 7.89e-05 ***
## IQR_Season24B < 2e-16 ***
## IQR_Season25A 1.24e-05 ***
## IQR_Season25B < 2e-16 ***
## IQF_environmentMid_Low_Dry 0.000613 ***
## IQF_environmentMid_Low_Wet 0.000330 ***
## IQF_environmentTransition 0.055199 .
## Treat_codeC 2.38e-05 ***
## IQR_Season23B:IQF_environmentMid_Low_Dry 0.088461 .
## IQR_Season24A:IQF_environmentMid_Low_Dry 1.25e-06 ***
## IQR_Season24B:IQF_environmentMid_Low_Dry 0.028071 *
## IQR_Season25A:IQF_environmentMid_Low_Dry 0.379705
## IQR_Season25B:IQF_environmentMid_Low_Dry 0.412614
## IQR_Season23B:IQF_environmentMid_Low_Wet 0.041471 *
## IQR_Season24A:IQF_environmentMid_Low_Wet 0.150693
## IQR_Season24B:IQF_environmentMid_Low_Wet 0.884848
## IQR_Season25A:IQF_environmentMid_Low_Wet 0.000348 ***
## IQR_Season25B:IQF_environmentMid_Low_Wet 0.193442
## IQR_Season23B:IQF_environmentTransition 0.000358 ***
## IQR_Season24A:IQF_environmentTransition 0.113631
## IQR_Season24B:IQF_environmentTransition 0.001754 **
## IQR_Season25A:IQF_environmentTransition 0.000522 ***
## IQR_Season25B:IQF_environmentTransition 0.000563 ***
## IQF_environmentMid_Low_Dry:Treat_codeC 0.485007
## IQF_environmentMid_Low_Wet:Treat_codeC 0.914091
## IQF_environmentTransition:Treat_codeC 0.625834
## IQR_Season23B:Treat_codeC 0.061097 .
## IQR_Season24A:Treat_codeC 1.72e-09 ***
## IQR_Season24B:Treat_codeC 0.735259
## IQR_Season25A:Treat_codeC 0.709621
## IQR_Season25B:Treat_codeC 0.155149
## IQR_Season23B:IQF_environmentMid_Low_Dry:Treat_codeC 0.341986
## IQR_Season24A:IQF_environmentMid_Low_Dry:Treat_codeC 0.000145 ***
## IQR_Season24B:IQF_environmentMid_Low_Dry:Treat_codeC 0.185282
## IQR_Season25A:IQF_environmentMid_Low_Dry:Treat_codeC 0.313132
## IQR_Season25B:IQF_environmentMid_Low_Dry:Treat_codeC 0.870420
## IQR_Season23B:IQF_environmentMid_Low_Wet:Treat_codeC 0.785259
## IQR_Season24A:IQF_environmentMid_Low_Wet:Treat_codeC 0.000319 ***
## IQR_Season24B:IQF_environmentMid_Low_Wet:Treat_codeC 0.473017
## IQR_Season25A:IQF_environmentMid_Low_Wet:Treat_codeC 0.721415
## IQR_Season25B:IQF_environmentMid_Low_Wet:Treat_codeC 0.823344
## IQR_Season23B:IQF_environmentTransition:Treat_codeC 0.509028
## IQR_Season24A:IQF_environmentTransition:Treat_codeC 0.013548 *
## IQR_Season24B:IQF_environmentTransition:Treat_codeC 0.894235
## IQR_Season25A:IQF_environmentTransition:Treat_codeC 0.552351
## IQR_Season25B:IQF_environmentTransition:Treat_codeC 0.559773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 48 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
anova(fit)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value
## IQR_Season 10906.4 2181.29 5 5090.1 1846.3812
## IQF_environment 144.2 48.06 3 1584.7 40.6770
## Treat_code 456.2 456.21 1 4932.1 386.1684
## IQR_Season:IQF_environment 572.7 38.18 15 5134.1 32.3174
## IQF_environment:Treat_code 10.0 3.32 3 4935.8 2.8144
## IQR_Season:Treat_code 106.8 21.36 5 4932.0 18.0778
## IQR_Season:IQF_environment:Treat_code 42.0 2.80 15 4933.6 2.3681
## Pr(>F)
## IQR_Season < 2.2e-16 ***
## IQF_environment < 2.2e-16 ***
## Treat_code < 2.2e-16 ***
## IQR_Season:IQF_environment < 2.2e-16 ***
## IQF_environment:Treat_code 0.037797 *
## IQR_Season:Treat_code < 2.2e-16 ***
## IQR_Season:IQF_environment:Treat_code 0.002128 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load required libraries
library(dplyr)
library(ggplot2)
# 1. Filter and prepare main data
plot_data <- CONV_CA_Ratio %>%
filter(is.finite(Y_ratio), Y_ratio <= 5)
# 2. Compute p-values, significance, and means
sig_labels <- plot_data %>%
group_by(IQR_Season, IQF_environment, crop) %>%
summarise(
mean_y = mean(Y_ratio, na.rm = TRUE),
p_value = tryCatch(t.test(Y_ratio, mu = 1)$p.value, error = function(e) NA_real_),
.groups = "drop"
) %>%
mutate(
signif = case_when(
is.na(p_value) ~ "",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
),
mean_label = round(mean_y, 2) # Rounded mean
)
# 3. Get max Y for each group to place stars
y_pos <- plot_data %>%
group_by(IQR_Season, IQF_environment, crop) %>%
summarise(y_max = max(Y_ratio, na.rm = TRUE), .groups = "drop")
# 4. Merge label + position
sig_labels <- left_join(sig_labels, y_pos, by = c("IQR_Season", "IQF_environment", "crop"))
# 5. Plot
ggplot(plot_data, aes(x = IQR_Season, y = Y_ratio, color = crop)) +
geom_boxplot(outlier.alpha = 0.3) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray40") +
facet_wrap(~ IQF_environment) +
# Add significance stars
geom_text(
data = sig_labels,
aes(x = IQR_Season, y = y_max + 0.15, label = signif, group = crop),
inherit.aes = FALSE,
size = 5,
fontface = "bold"
) +
# Add mean values just above the stars
geom_text(
data = sig_labels,
aes(x = IQR_Season, y = y_max + 0.7, label = mean_label, group = crop),
inherit.aes = FALSE,
size = 3.5,
fontface = "plain",
color = "black"
) +
labs(
title = "Yield Ratio (CA / CONV) by Season and Environment",
x = "Season",
y = "Yield Ratio"
) +
ylim(0, 5.7) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold")
)
### Distribution of the residuals from mix model Although Shapiro–Wilk
tests indicated statistically significant deviations from normality (p
< 0.001), that my be because of the large sample size; W=0.99 the
residuals are close to normality and Q–Q plots showed no substantial
departures from normality, indicating that model assumptions were
adequately met.
library(broom.mixed)
library(dplyr)
resid_df <- augment(fit) %>%
mutate(
resid = .resid,
fitted = .fitted
)
library(ggplot2)
ggplot(resid_df, aes(x = resid)) +
geom_histogram(aes(y = after_stat(density)),
bins = 40,
fill = "grey80",
color = "black") +
geom_density(color = "blue", linewidth = 1) +
labs(
title = "Distribution of residuals",
x = "Residuals",
y = "Density"
) +
theme_minimal()
ggplot(resid_df, aes(sample = resid)) +
stat_qq(color = "black") +
stat_qq_line(color = "red", linewidth = 1) +
labs(
title = "Q–Q plot of residuals"
) +
theme_minimal()
ggplot(resid_df, aes(x = fitted, y = resid)) +
geom_point(alpha = 0.4) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Residuals vs fitted values",
x = "Fitted values",
y = "Residuals"
) +
theme_minimal()
### Test of residuals H0 rejected but because we have large N, the W is
actually very close to 1…so its quite normal
set.seed(123)
shapiro.test(sample(resid_df$resid, 5000))
##
## Shapiro-Wilk normality test
##
## data: sample(resid_df$resid, 5000)
## W = 0.98611, p-value < 2.2e-16
Mean is 0 and symetric distribution of the residual of the random effects
qqnorm(ranef(fit)$IQR_block[[1]])
qqline(ranef(fit)$IQR_block[[1]], col = "red")
To describe how farms perform under CA over time, we created a typology of block-level responses using their relative yield (CA / CONV) across six seasons (23A–25B). Only blocks with data in at least four seasons were retained to ensure stable classification.
For each block, we calculated its average relative yield, identified whether it ever experienced a yield penalty greater than 5%, and compiled its full sequence of seasonal performances. Based on these indicators, each farm was grouped into one of four long-term performance types: -Good: no season < 0.95 -Acceptable: Average ≥ 0.95 but at least one season < 0.95 -Poor: Average between 0.85 and 0.95 -Very Poor: Average ≤ 0.85
These thresholds were defined arbitrarily but intentionally, informed by field experience and our understanding of what yield reductions farmers may find acceptable if longer-term soil-health and economic benefits of CA are demonstrated and communicated clearly. Importantly, this typology is not final—it will be refined and validated through farmer consultations to ensure the categories and cut-off points align with farmers’ own perceptions of acceptable and unacceptable performance.
Notes: Average performance of CA is very bad, even if we target the environmetns or the districts with best performance, this blanket reccomednation would fail baddly. However,if we would be able to target the plots with good performance.. the rate of success of the reccomendatio could be high…for aprox 20% of the plots (i.e., 35% of the farmers…)
library(dplyr)
library(tidyr)
# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")
# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- CONV_CA_Ratio %>%
filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>%
group_by(IQR_block) %>%
summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
filter(n_valid_seasons >= 4)
# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- CONV_CA_Ratio %>%
filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
IQR_Season %in% seasons,
is.finite(Y_ratio))
# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
group_by(IQR_block, IQR_Season) %>%
summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(
names_from = IQR_Season,
values_from = Y_ratio,
names_prefix = "Y_ratio_"
)
# Step 5: Compute performance typology
Perf_Type <- filtered_yield %>%
group_by(IQR_block) %>%
summarise(
n_seasons = n_distinct(IQR_Season),
any_below_0.95 = any(Y_ratio < 0.95),
mean_ratio = mean(Y_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
CA_typology = case_when(
!any_below_0.95 ~ "Good",
any_below_0.95 & mean_ratio >= 0.95 ~ "Acceptable",
any_below_0.95 & mean_ratio < 0.95 & mean_ratio > 0.85 ~ "Poor",
mean_ratio <= 0.85 ~ "Very Poor"
)
)
# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- CONV_CA_Ratio %>%
filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>% # Prioritize 23A > 24A > 25A
distinct(IQR_block, .keep_all = TRUE) %>%
dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)
# Step 7: Join all into Perf_Type
Perf_Type <- Perf_Type %>%
left_join(RY_wide, by = "IQR_block") %>%
left_join(block_meta, by = "IQR_block")
Now we analyse the frequency of each type
# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
levels = c("Very Poor", "Poor", "Acceptable", "Good"))
ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
geom_bar(
aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
position = "stack"
) +
facet_wrap(~ IQF_environment) +
labs(
title = "CA-Performance Typology Distribution by OAF-AEZ",
x = "Performance Typology",
y = "Percentage of Performance Typologies"
) +
scale_fill_manual(values = c(
"Very Poor" = "#e41a1c",
"Poor" = "#ff7f00",
"Acceptable" = "#377eb8",
"Good" = "#4daf4a"
)) +
theme_minimal() +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
)
## What if we target the districts with the best performnace? wht could
be the rate of success? Notes: Yes better chance but it would still fail
in all
# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
levels = c("Very Poor", "Poor", "Acceptable", "Good"))
ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
geom_bar(
aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
position = "stack"
) +
facet_wrap(~ IQF_District) +
labs(
title = "CA-Performance Typology Distribution by District",
x = "Performance Typology",
y = "Percentage of Performance Typologies"
) +
scale_fill_manual(values = c(
"Very Poor" = "#e41a1c",
"Poor" = "#ff7f00",
"Acceptable" = "#377eb8",
"Good" = "#4daf4a"
)) +
theme_minimal() +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
)
## Another way to see it
library(dplyr)
library(ggplot2)
library(glue)
# 1. Mean Y_ratio per block (across seasons)
block_means <- CONV_CA_Ratio %>%
filter(is.finite(Y_ratio)) %>%
group_by(IQR_block, IQF_District) %>%
summarise(mean_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")
# 2. Summary for annotations
district_summary <- block_means %>%
group_by(IQF_District) %>%
summarise(
avg_ratio = round(mean(mean_Y_ratio, na.rm = TRUE), 2),
n_total = n(),
n_ge1 = sum(mean_Y_ratio >= 1, na.rm = TRUE),
percent_ge1 = round(100 * n_ge1 / n_total)
) %>%
mutate(
label = glue("≥1: {percent_ge1}%\nµ = {avg_ratio}"),
y_pos = 1.5
)
# 3. Plot
ggplot(block_means, aes(x = IQF_District, y = mean_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "gray90") +
geom_jitter(width = 0.2, alpha = 0.5, color = "steelblue", size = 2) +
geom_text(
data = district_summary,
aes(x = IQF_District, y = y_pos, label = label),
inherit.aes = FALSE,
size = 3.5,
vjust = 0
) +
labs(
title = "Distribution of Mean Y_ratio per Block by District",
x = "District",
y = "Mean Y_ratio per Block (across seasons)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold", hjust = 0.5)
)
# Drivers of yield performance We start with the database of yield
drivers that was already processed in the scrips: “data_mm_block” but we
change the relative
data_mm_block <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/data_mm_block.csv")
## Rows: 473 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): IQR_block, IQR_TF_tubura_client, IQR_plot_fert_quality, IQR_soil_te...
## dbl (8): ICR_Org_C_avg, ICR_K_perc_23A, ICR_Avail_P_avg, ICR_arable_land_avg...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drivers_data <- data_mm_block %>%
mutate(
CA_typology = fct_drop(CA_typology),
IQR_block = factor(IQR_block),
IQR_TF_tubura_client = factor(IQR_TF_tubura_client),
IQR_plot_fert_quality = factor(IQR_plot_fert_quality),
IQR_soil_texture = fct_na_value_to_level(factor(IQR_soil_texture), "Unknown"),
IQR_soil_color = fct_na_value_to_level(factor(IQR_soil_color), "Unknown"),
IQF_position_slope = fct_na_value_to_level(factor(IQF_position_slope), "Unknown"),
Weed_species_A_combined = fct_na_value_to_level(factor(Weed_species_A_combined), "Unknown")
)
str(drivers_data)
## tibble [473 Ă— 17] (S3: tbl_df/tbl/data.frame)
## $ IQR_block : Factor w/ 473 levels "10_1","10_10",..: 15 16 17 18 19 20 21 22 23 24 ...
## $ ICR_Org_C_avg : num [1:473] 2.01 1.75 2.16 1.68 1.99 ...
## $ ICR_K_perc_23A : num [1:473] 0.75 0.32 0.57 0.41 0.42 0.55 1.75 1.39 2.09 1.25 ...
## $ ICR_Avail_P_avg : num [1:473] 11.7 11.5 22 60.6 35.6 ...
## $ ICR_arable_land_avg : num [1:473] 40.4 56.4 45.8 54 28.2 54.6 51.8 47.8 60.2 38.2 ...
## $ ICF_Alt_avg : num [1:473] 1636 1293 1141 1207 1145 ...
## $ ICR_rainfall_avg : num [1:473] 3.96 3.97 3.97 3.97 3.97 ...
## $ Comp_amount_corr_quali : num [1:473] 5.17 4.39 4.42 4.73 4.9 ...
## $ ICF_planting_date : num [1:473] 8.2 9.4 6.75 12.4 5.2 2.8 6.4 4.4 5.2 6.8 ...
## $ IQR_TF_tubura_client : Factor w/ 3 levels "---","0","1": 3 3 2 3 2 3 3 3 3 3 ...
## $ IQR_plot_fert_quality : Factor w/ 4 levels "---","high","low_quality",..: 4 2 4 2 4 4 4 2 4 2 ...
## $ IQR_soil_texture : Factor w/ 4 levels "Coarse","Fine",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ IQR_soil_color : Factor w/ 6 levels "#N/A","black",..: 2 2 2 5 2 5 2 2 5 2 ...
## $ IQF_position_slope : Factor w/ 7 levels "---","Backslope",..: 4 2 2 2 4 2 6 2 2 2 ...
## $ Weed_species_A_combined: Factor w/ 5 levels "—","Broadleaf",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ CA_typology : Factor w/ 3 levels "Acceptable","Poor",..: 3 1 3 2 1 3 1 1 2 2 ...
## $ IQF_environment : chr [1:473] "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" ...
# 1. Remove 'CA_typology' column from drivers_data
drivers_data <- drivers_data |> dplyr::select(-CA_typology)
# 2. Calculate average Y_ratio by IQR_block from CONV_CA_Ratio
# First, ensure grouping by both IQR_block and IQR_season, then average per IQR_block
avg_Y_ratio <- CONV_CA_Ratio |>
dplyr::group_by(IQR_block, IQR_Season) |>
dplyr::summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") |>
dplyr::group_by(IQR_block) |>
dplyr::summarise(avg_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")
# 3. Merge avg_Y_ratio into drivers_data by IQR_block
drivers_data <- drivers_data |>
dplyr::left_join(avg_Y_ratio, by = "IQR_block")
library(dplyr)
# Merge CA_typology from Perf_Type using IQR_block as the key
drivers_data <- drivers_data |>
left_join(Perf_Type |>
dplyr::select(IQR_block, CA_typology),
by = "IQR_block")
#Explore, visually the protential drivers and the relatio nwith Y_ration of the blocks ## Cathegoric variables Variables we have so far (after filtering some out and re-grouping cathegories with low N: ICR_Org_C_avg num: ICR_K_perc_23A num: ICR_Avail_P_avg num: ICR_arable_land_avg num: ICF_Alt_avg num: ICR_rainfall_avg num: Comp_amount_corr_quali num: ICF_planting_date num: IQR_TF_tubura_client Factor: IQR_plot_fert_quality Factor: IQR_soil_texture Factor w/ 4 levels: IQR_soil_color Factor w/ 6 levels: IQF_position_slope Factor w/ 7 levels: Weed_species_A_combined: Factor w/ 5 levels: IQF_environment chr “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” Output variables: avg_Y_ratio num: CA_typology factor:
Notes: when we look at all the database pooled together, there is no relation at all among SOC and performance of the CA. Howeever, where we split the analysis be Environment, then an interation becomes clear, where there is no relation in mid altitude and the relation is quite significant and positive in Highland and sligtly negative in transition. Conclusion. SOC should remain for the drivers analysis
library(ggplot2)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_Org_C_avg, data = drivers_data)
# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4] # p-value for slope
# 3. Create plot
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICR_Org_C_avg vs. avg_Y_ratio",
x = "Organic Carbon (%)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3)),
size = 5, color = "black") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### What if we facet by environment? to see if interactions are masking
the effect
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICR_Org_C_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_Org_C_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Prepare annotation positions (adjust if needed)
#----------------------------------------------
# Choose fixed position for annotation
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot: points, lm line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_Org_C_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values shown per environment",
x = "Organic Carbon (%)",
y = "Average Yield Ratio"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Fit ANOVA model
model_anova <- aov(ICR_Org_C_avg ~ CA_typology, data = drivers_data)
# 2. Tukey HSD post-hoc test
tukey <- TukeyHSD(model_anova)
# 3. Extract group letters
letters_df <- multcompLetters(tukey$CA_typology[,"p adj"])$Letters
letters_df <- data.frame(CA_typology = names(letters_df), Letters = letters_df)
# 4. Calculate group means
means_df <- drivers_data %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 5. Combine letters and means
plot_labels <- left_join(letters_df, means_df, by = "CA_typology")
# 6. Create boxplot with letters and means at fixed Y positions
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray", color = "black") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.5) +
geom_text(data = plot_labels, aes(x = CA_typology, y = 7.5, label = Letters),
fontface = "bold", size = 5, color = "darkred") +
geom_text(data = plot_labels, aes(x = CA_typology, y = 7, label = mean_label),
fontface = "plain", size = 4, color = "black") +
labs(
title = "ICR_Org_C_avg by CA_typology",
subtitle = "Letters indicate significant differences (Tukey HSD)",
x = "CA Typology",
y = "Organic Carbon (%)"
) +
theme_minimal()
### What if we facet by Environment? myve some interactions masking the
effect?
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
mod <- aov(ICR_Org_C_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(CA_typology)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
# Letters
geom_text(
data = stats_df,
aes(x = CA_typology, y = 9.5, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
# Means
geom_text(
data = stats_df,
aes(x = CA_typology, y = 8.25, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_Org_C_avg by CA_typology (per environment)",
subtitle = "Letters = Tukey HSD within each environment",
x = "CA Typology",
y = "Organic Carbon (%)"
) +
theme_minimal()
## CA Performance as affected by ICR_K_perc_23A ### with avg_Y_ratio
Notes: there is some minimal relation in highland. When we look at the
correlation among SOC and K, its relatively high in highland. So we
could just use SOC in the drivers analysis and exclude K Conclusion:
Keep SCO but explcude K
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICR_K_perc_23A), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_K_perc_23A, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_K_perc_23A, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_K_perc_23A vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values shown per environment",
x = "Potassium (%) in 23A",
y = "Average Yield Ratio"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
### with CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
mod <- aov(ICR_K_perc_23A ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICR_K_perc_23A)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_K_perc_23A, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_K_perc_23A)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
# Letters
geom_text(
data = stats_df,
aes(x = CA_typology, y = 3.2, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
# Means
geom_text(
data = stats_df,
aes(x = CA_typology, y = 2.8, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_K_perc_23A by CA_typology (per environment)",
subtitle = "Letters = Tukey HSD within each environment",
x = "CA Typology",
y = "Potassium (%) in 23A"
) +
theme_minimal()
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Since SOC semms to have some explanatory power and K very little,
lets see the correlation among them Notes: there is quite a correlation
among K and SOC in highland
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
filter(!is.na(ICR_Org_C_avg), !is.na(ICR_K_perc_23A), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(ICR_K_perc_23A ~ ICR_Org_C_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR", "\u00B2", " = ", round(r_squared, 2))
)
#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95
correlation_stats <- correlation_stats %>%
mutate(x = label_x, y = label_y)
#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_K_perc_23A)) +
geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = correlation_stats,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
hjust = 1, vjust = 1,
size = 4
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Correlation between Organic Carbon and Potassium (23A)",
subtitle = "Linear regression per environment with p-values and R\u00B2",
x = "ICR_Org_C_avg (Organic Carbon %)",
y = "ICR_K_perc_23A (Potassium %)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
## CA Performance as affected by ICR_Avail_P_avg ### with avg_Y_ratio
NOtes: P seems to have some explanatory power in 2 of the 4 environemnts
and the correlation with SOC is not strong. Conclusions: keep it for the
drivers analysis
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICR_Avail_P_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_Avail_P_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Avail_P_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_Avail_P_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values shown per environment",
x = "Available Phosphorus (ppm)",
y = "Average Yield Ratio"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
###ICR_Avail_P_avg by CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
mod <- aov(ICR_Avail_P_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICR_Avail_P_avg)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_Avail_P_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Avail_P_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
# Letters
geom_text(
data = stats_df,
aes(x = CA_typology, y = 110, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
# Means
geom_text(
data = stats_df,
aes(x = CA_typology, y = 100, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_Avail_P_avg by CA_typology (per environment)",
subtitle = "Letters = Tukey HSD within each environment",
x = "CA Typology",
y = "Available Phosphorus (ppm)"
) +
theme_minimal()
###Correlation: ICR_Org_C_avg vs. ICR_Avail_P_avg
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
filter(!is.na(ICR_Org_C_avg), !is.na(ICR_Avail_P_avg), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(ICR_Avail_P_avg ~ ICR_Org_C_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR", "\u00B2", " = ", round(r_squared, 2))
)
#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95
correlation_stats <- correlation_stats %>%
mutate(x = label_x, y = label_y)
#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_Avail_P_avg)) +
geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = correlation_stats,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
hjust = 1, vjust = 1,
size = 4
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Correlation between Organic Carbon and Available Phosphorus",
subtitle = "Linear regression per environment with p-values and R\u00B2",
x = "ICR_Org_C_avg (Organic Carbon %)",
y = "ICR_Avail_P_avg (Phosphorus ppm)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## CA Performance as affected by ICR_arable_land_avg
NOtes: no correlation when pooled together but a trant in 2 out of 4 environments. Moreover it is a hosehold variable that is ususally used to explain perfomrance of practices Conclusions: Keep it for drivers analysis
library(ggplot2)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_arable_land_avg, data = drivers_data)
# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4] # p-value for slope
# 3. Create plot
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICR_arable_land_avg vs. avg_Y_ratio",
x = "Arable Land (%)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3)),
size = 5, color = "black") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### In facet by environment
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICR_arable_land_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_arable_land_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_arable_land_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values shown per environment",
x = "Arable land (%)",
y = "Average Yield Ratio"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
###ICR_arable_land_avg by CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
get_letters <- function(df) {
mod <- aov(ICR_arable_land_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICR_arable_land_avg)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_arable_land_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 1)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
ggplot(drivers_data, aes(x = CA_typology, y = ICR_arable_land_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 90, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 82, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_arable_land_avg by CA_typology (per environment)",
subtitle = "Letters = Tukey HSD within each environment",
x = "CA Typology",
y = "Arable land (%)"
) +
theme_minimal()
###Correlation: ICR_Org_C_avg vs ICR_arable_land_avg
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
correlation_stats <- drivers_data %>%
filter(!is.na(ICR_Org_C_avg), !is.na(ICR_arable_land_avg), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(ICR_arable_land_avg ~ ICR_Org_C_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR", "\u00B2", " = ", round(r_squared, 2))
)
label_y <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95
correlation_stats <- correlation_stats %>%
mutate(x = label_x, y = label_y)
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_arable_land_avg)) +
geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = correlation_stats,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
hjust = 1, vjust = 1,
size = 4
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Correlation between Organic Carbon and Arable Land",
subtitle = "Linear regression per environment with p-values and R\u00B2",
x = "ICR_Org_C_avg (Organic Carbon %)",
y = "ICR_arable_land_avg (Arable land %)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## CA Performance as affected by ICF_Alt_avg
NOtes: there seems to be no correlation when pooled together and even less when split by environment. Environment clasiffication is mainly based on altitude so if we are using em=nvironments as driver, altitude can be eliminated Conclusions: Do not use further in drivers analysis
###All together
library(ggplot2)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_Alt_avg, data = drivers_data)
# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4] # p-value for slope
# 3. Create plot
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICF_Alt_avg vs. avg_Y_ratio",
x = "Altitude (%)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3)),
size = 5, color = "black") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICF_Alt_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICF_Alt_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICF_Alt_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_Alt_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values per environment",
x = "Altitude (m)",
y = "Average Yield Ratio"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
###ICF_Alt_avg by CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
# Function for letters
get_letters <- function(df) {
mod <- aov(ICF_Alt_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
# Run stats by environment
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICF_Alt_avg)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICF_Alt_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 0)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICF_Alt_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 2000, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 1800, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_Alt_avg by CA_typology (per environment)",
subtitle = "Tukey HSD: significant differences by letter",
x = "CA Typology",
y = "Altitude (m)"
) +
theme_minimal()
NOtes: Rain seems to partially explain CA performance, with a cuadratice-model response when pooled all environmetns together, with 4 mm per day as optimumn. However, when split by environments, linear models can fit as good Conclusions: Keep, consider cuadratic relation and not linear if not grouped within environments ###Simple regression: ICR_rainfall_avg vs avg_Y_ratio
library(ggplot2)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg, data = drivers_data)
# 2. Extract p-value and R-squared from model summary
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared
# 3. Create plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICR_rainfall_avg vs. avg_Y_ratio",
x = "Rainfall (avg, mm)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 = ", round(r2, 2)),
size = 5, color = "black") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### What wbout tieh a cuadratic model?
library(ggplot2)
# 1. Fit quadratic model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2),
data = drivers_data)
# 2. Extract p-value and R²
p_val <- summary(model)$coefficients["I(ICR_rainfall_avg^2)", 4]
r2 <- summary(model)$r.squared
# 3. Create plot with p and R²
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(
method = "lm",
formula = y ~ x + I(x^2),
se = TRUE,
color = "darkred",
linetype = "dashed"
) +
labs(
title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
x = "Rainfall (avg, mm)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 = ", round(r2, 2)),
size = 5)
###Faceted regression: ICR_rainfall_avg vs avg_Y_ratio by environment
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
# 1. Fit separate linear models per environment
stats_df <- drivers_data %>%
filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
# 2. Annotation positions
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_rainfall_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_rainfall_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values per environment",
x = "Rainfall (avg, mm)",
y = "Average Yield Ratio"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### same as avobe but with cuadratic model
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
# 1. Fit quadratic models per environment
stats_df <- drivers_data %>%
filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2), data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients["I(ICR_rainfall_avg^2)", 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
# IMPORTANT: use R^2 (not R²)
stat_label = paste0(
"p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2)
)
) %>%
dplyr::select(IQF_environment, stat_label)
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(
method = "lm",
formula = y ~ x + I(x^2),
se = TRUE,
color = "darkred",
linetype = "dashed"
) +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
subtitle = "Quadratic regressions per environment with p-values and R^2",
x = "Rainfall (avg, mm)",
y = "Average Yield Ratio"
)
###Boxplot: ICR_rainfall_avg by CA_typology (with Tukey HSD letters)
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
# Function to extract Tukey letters
get_letters <- function(df) {
mod <- aov(ICR_rainfall_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
# Run stats by environment
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICR_rainfall_avg)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_rainfall_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICR_rainfall_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 7, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 6.2, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_rainfall_avg by CA_typology (per environment)",
subtitle = "Tukey HSD: significant differences by letter",
x = "CA Typology",
y = "Rainfall (avg, mm)"
) +
theme_minimal()
## CA Performance as affected by Comp_amount_corr_quali
NOtes: It seems to have a significant relation with both Y_relative and typology. Wether pooled all together of per environment, but when split by environment, all show the sam esign of trend, but is clear and significant only for mid-dry and transition Conclusions: Keep for the drivers analysis ###Simple regression: Comp_amount_corr_quali vs avg_Y_ratio
library(ggplot2)
# Filter data
df_filt <- drivers_data %>%
dplyr::filter(!is.na(Comp_amount_corr_quali),
!is.na(avg_Y_ratio),
Comp_amount_corr_quali <= 15)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = df_filt)
# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared
# 3. Create plot
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "Comp_amount_corr_quali vs. avg_Y_ratio",
x = "Compost amount (corrected, ≤15)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 = ", round(r2, 2)),
size = 5) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
###Linear regression per environment (facet + R² + p, compost ≤ 15)
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
df_filt <- drivers_data %>%
dplyr::filter(!is.na(Comp_amount_corr_quali),
!is.na(avg_Y_ratio),
!is.na(IQF_environment),
Comp_amount_corr_quali <= 15)
#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2))
) %>%
dplyr::select(IQF_environment, stat_label)
#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Comp_amount_corr_quali vs. avg_Y_ratio",
subtitle = "Linear regressions per environment (compost ≤ 15)",
x = "Compost amount (corrected)",
y = "Average Yield Ratio"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
###Boxplot by CA_typology (Tukey HSD, compost ≤ 15)
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
df_filt <- drivers_data %>%
dplyr::filter(!is.na(CA_typology),
!is.na(Comp_amount_corr_quali),
!is.na(IQF_environment),
Comp_amount_corr_quali <= 15)
get_letters <- function(df) {
mod <- aov(Comp_amount_corr_quali ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(Comp_amount_corr_quali, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
ggplot(df_filt, aes(x = CA_typology, y = Comp_amount_corr_quali)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95,
label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.85,
label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Comp_amount_corr_quali by CA_typology (per environment)",
subtitle = "Tukey HSD (compost ≤ 15)",
x = "CA Typology",
y = "Compost amount (corrected)"
) +
theme_minimal()
## CA Performance as affected by ICF_planting_date
NOtes: Conclusions: ###Simple regression: ICR_rainfall_avg ICF_planting_date
library(ggplot2)
library(dplyr)
# Filter data
df_filt <- drivers_data %>%
dplyr::filter(!is.na(ICF_planting_date),
!is.na(avg_Y_ratio))
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_planting_date, data = df_filt)
# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared
# 3. Create plot
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICF_planting_date vs. avg_Y_ratio",
x = "Planting date",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 = ", round(r2, 2)),
size = 5) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
###Linear regression per environment (facet + R^2 + p)
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
df_filt <- drivers_data %>%
dplyr::filter(!is.na(ICF_planting_date),
!is.na(avg_Y_ratio),
!is.na(IQF_environment))
#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICF_planting_date, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2))
) %>%
dplyr::select(IQF_environment, stat_label)
#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_planting_date vs. avg_Y_ratio",
subtitle = "Linear regressions per environment",
x = "Planting date",
y = "Average Yield Ratio"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
###Boxplot by CA_typology (Tukey HSD)
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
df_filt <- drivers_data %>%
dplyr::filter(!is.na(CA_typology),
!is.na(ICF_planting_date),
!is.na(IQF_environment))
get_letters <- function(df) {
mod <- aov(ICF_planting_date ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICF_planting_date, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 1)))
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(means)`.
ggplot(df_filt, aes(x = CA_typology, y = ICF_planting_date)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95,
label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.85,
label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_planting_date by CA_typology (per environment)",
subtitle = "Tukey HSD",
x = "CA Typology",
y = "Planting date"
) +
theme_minimal()
###How does delay in planting relate to rain over the season?
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
# 1. Filter valid data
df_filt <- drivers_data %>%
dplyr::filter(
!is.na(ICF_planting_date),
!is.na(ICR_rainfall_avg),
!is.na(IQF_environment)
)
# 2. Fit linear model per environment
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(ICR_rainfall_avg ~ ICF_planting_date, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2))
) %>%
dplyr::select(IQF_environment, stat_label)
# 3. Annotation position
label_y <- max(df_filt$ICR_rainfall_avg, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
# 4. Plot
ggplot(df_filt, aes(x = ICF_planting_date, y = ICR_rainfall_avg)) +
geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 4,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_planting_date vs. ICR_rainfall_avg",
subtitle = "Linear regression per environment (p-value and R^2 shown)",
x = "Planting date",
y = "Rainfall average (mm)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
##Factorial variables ## CA Performance as affected by
IQR_TF_tubura_client NOtes: Conclusions: ###IQR_TF_tubura_client with
Y_ratio
library(ggplot2)
library(dplyr)
# 1. Filter valid data
df_filt <- drivers_data %>%
dplyr::filter(
!is.na(avg_Y_ratio),
!is.na(IQR_TF_tubura_client),
!IQR_TF_tubura_client %in% c("no data", "---", "", "NA")
) %>%
mutate(IQR_TF_tubura_client = droplevels(IQR_TF_tubura_client)) # drop unused levels
# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Run Kruskal-Wallis test
kruskal_test <- kruskal.test(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
kruskal_p <- kruskal_test$p.value
# 4. Build annotation label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 5. Calculate group means
mean_df <- df_filt %>%
group_by(IQR_TF_tubura_client) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Plot boxplot with p-values and means
ggplot(df_filt, aes(x = IQR_TF_tubura_client, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Means under each box
geom_text(
data = mean_df,
aes(x = IQR_TF_tubura_client,
y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
label = mean_label),
inherit.aes = FALSE,
size = 4
) +
# P-value annotation
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQR_TF_tubura_client",
subtitle = "ANOVA and Kruskal–Wallis test results shown",
x = "Tubura client status",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQR_TF_tubura_client with CA_typology
NOtes: Conclusions: ###IQR_plot_fert_quality with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQR_plot_fert_quality),
!IQR_plot_fert_quality %in% c("no data", "---", "", "NA")
) %>%
mutate(IQR_plot_fert_quality = factor(IQR_plot_fert_quality, levels = c("low_quality", "medium", "high")))
# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Tukey HSD and Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_plot_fert_quality`[, "p adj"])$Letters
letters_df <- data.frame(
IQR_plot_fert_quality = names(letters),
Letters = letters,
stringsAsFactors = FALSE
)
# 4. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)$p.value
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 5. Group means
means_df <- df_filt %>%
group_by(IQR_plot_fert_quality) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine means and letters
labels_df <- left_join(means_df, letters_df, by = "IQR_plot_fert_quality")
# 7. Plot
ggplot(df_filt, aes(x = IQR_plot_fert_quality, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Add Tukey letters above boxes
geom_text(
data = labels_df,
aes(x = IQR_plot_fert_quality, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Add means below boxes
geom_text(
data = labels_df,
aes(x = IQR_plot_fert_quality, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
# Add test label
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQR_plot_fert_quality",
subtitle = "Tukey HSD letters, means, and statistical tests",
x = "Plot Fertility Quality",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQR_plot_fert_quality with CA_typology
NOtes: Conclusions: ###IQR_soil_texture with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean the data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQR_soil_texture),
!IQR_soil_texture %in% c("no data", "---", "", "NA")
) %>%
mutate(IQR_soil_texture = droplevels(IQR_soil_texture))
# Optional: Reorder texture levels manually (if desired)
# Example order: "sand", "loam", "clay", etc.
df_filt <- df_filt %>%
mutate(IQR_soil_texture = factor(IQR_soil_texture, levels = c("Fine", "Mix", "Coarse")))
# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Run Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)$p.value
# 4. Tukey HSD and compact letter display
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_texture`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_texture = names(letters), Letters = letters)
# 5. Compute group means
means_df <- df_filt %>%
group_by(IQR_soil_texture) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_texture")
# 7. Test label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_texture, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Letters above box
geom_text(
data = labels_df,
aes(x = IQR_soil_texture, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means below box
geom_text(
data = labels_df,
aes(x = IQR_soil_texture, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
# Statistical tests label
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQR_soil_texture",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Soil Texture",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQR_soil_texture with CA_typology
NOtes: Conclusions: ###IQR_soil_color with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean the data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQR_soil_color),
!IQR_soil_color %in% c("no data", "---", "", "#N/A")
) %>%
mutate(IQR_soil_color = droplevels(IQR_soil_color))
# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_color, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Kruskal–Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_color, data = df_filt)$p.value
# 4. Tukey HSD and letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_color`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_color = names(letters), Letters = letters)
# 5. Group means
means_df <- df_filt %>%
group_by(IQR_soil_color) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine letters and means
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_color")
# 7. Test label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_color, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Letters above boxes
geom_text(
data = labels_df,
aes(x = IQR_soil_color, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means below boxes
geom_text(
data = labels_df,
aes(x = IQR_soil_color, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQR_soil_color",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Soil Color",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQR_soil_color with CA_typology
NOtes: Conclusions: ###IQF_position_slope with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQF_position_slope),
!IQF_position_slope %in% c("no data", "---", "", "NA")
) %>%
mutate(IQF_position_slope = droplevels(IQF_position_slope))
# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_position_slope, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_position_slope, data = df_filt)$p.value
# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_position_slope`[, "p adj"])$Letters
letters_df <- data.frame(IQF_position_slope = names(letters), Letters = letters)
# 5. Group means
means_df <- df_filt %>%
group_by(IQF_position_slope) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQF_position_slope")
# 7. Statistical test label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = IQF_position_slope, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Tukey HSD letters (above)
geom_text(
data = labels_df,
aes(x = IQF_position_slope, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means (below)
geom_text(
data = labels_df,
aes(x = IQF_position_slope, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQF_position_slope",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Slope Position",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQF_position_slope with CA_typology
NOtes: Conclusions: ###Weed_species_A_combined with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(Weed_species_A_combined),
!Weed_species_A_combined %in% c("no data", "-", "—", "NA")
) %>%
mutate(Weed_species_A_combined = droplevels(Weed_species_A_combined))
# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Kruskal–Wallis
kruskal_p <- kruskal.test(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)$p.value
# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`Weed_species_A_combined`[, "p adj"])$Letters
letters_df <- data.frame(Weed_species_A_combined = names(letters), Letters = letters)
# 5. Group means
means_df <- df_filt %>%
group_by(Weed_species_A_combined) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "Weed_species_A_combined")
# 7. Combine test labels
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = Weed_species_A_combined, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Letters above
geom_text(
data = labels_df,
aes(x = Weed_species_A_combined, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means below
geom_text(
data = labels_df,
aes(x = Weed_species_A_combined, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by Weed_species_A_combined",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Weed Species (Combined)",
y = "Average Yield Ratio"
) +
theme_minimal()
###Weed_species_A_combined with CA_typology
NOtes: Conclusions: ###IQF_environment with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQF_environment),
!IQF_environment %in% c("no data", "---", "", "NA")
) %>%
mutate(IQF_environment = droplevels(factor(IQF_environment)))
df_filt <- df_filt %>%
mutate(IQF_environment = factor(IQF_environment, levels = c("Highland", "Transition", "Mid_Low_Wet", "Mid_Low_Dry")))
# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_environment, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_environment, data = df_filt)$p.value
# 4. Tukey HSD + Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_environment`[, "p adj"])$Letters
letters_df <- data.frame(IQF_environment = names(letters), Letters = letters)
# 5. Group means
means_df <- df_filt %>%
group_by(IQF_environment) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine letters + means
labels_df <- left_join(means_df, letters_df, by = "IQF_environment")
# 7. Test summary
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = IQF_environment, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Letters above
geom_text(
data = labels_df,
aes(x = IQF_environment, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means below
geom_text(
data = labels_df,
aes(x = IQF_environment, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQF_environment",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Environment Type",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQF_environment with CA_typology