This report extends the spatial analysis framework to all measured phenotypes in the inv4m field experiment. We systematically analyze nine traits (DTA, DTS, LAE, PH, EN, SL, BL, BW, EBA) using a consistent analytical pipeline that:
library(tidyverse)
library(dplyr)
library(ggplot2)
library(nlme)
library(gstat)
library(emmeans)
library(knitr)
library(ggpubr)
library(ggtext) # For markdown text in plots
library(VIM) # For missing data visualization
library(mgcv) # For GAM models if needed
# Load the dataset
file_path <- "~/Desktop/CLY25_Inv4m.csv"
if (!file.exists(file_path)) {
stop("Error: CLY25_Inv4m.csv not found in the working directory.")
}
field_data_raw <- read.csv(file_path, na.strings = c("","#N/A","NA"))
# Clean and prepare data
field_data <- field_data_raw %>%
# Calculate Estimated Blade Area (EBA) = 0.75 * BL * BW FIRST
mutate(EBA = 0.75 * BL * BW) %>%
rename(
plant_id = plant,
block = rep,
x = X_pos,
y = Y_pos,
inv4m = inv4m_gt
) %>%
# Convert relevant columns to factors
mutate(across(c(plot_id, block, donor, inv4m), as.factor)) %>%
# Add small amount of noise to x coordinates to avoid identical positions
mutate(x = x + runif(n(), min = 0.0, max = 0.01))
# Check data types after cleaning
cat("Data types after cleaning:\n")
## Data types after cleaning:
str(field_data[c("DTA", "DTS", "DTA_GDD", "DTS_GDD", "LAE", "PH", "EN", "SL", "BL", "BW","EBA")])
## 'data.frame': 442 obs. of 11 variables:
## $ DTA : int 77 77 80 80 78 79 77 75 78 75 ...
## $ DTS : int 79 79 81 81 80 81 77 76 78 76 ...
## $ DTA_GDD: int 1915 1915 2011 2011 1943 1975 1915 1857 1943 1857 ...
## $ DTS_GDD: int 1975 1975 2047 2047 2011 2047 1915 1888 1943 1888 ...
## $ LAE : int 5 5 5 5 5 5 6 6 5 6 ...
## $ PH : int 192 209 208 212 218 225 214 186 185 184 ...
## $ EN : int 2 3 3 3 3 3 3 3 3 3 ...
## $ SL : num 13.5 14 NA 14 13.5 14 13 15.5 13 NA ...
## $ BL : num 53 50.5 NA 53.5 56 64 58 45 45.5 NA ...
## $ BW : num 8 7.5 NA 7.5 8 8.5 8 8.5 7.5 NA ...
## $ EBA : num 318 284 NA 301 336 ...
# Define phenotypes to analyze - all traits from DTA to BW
# Check actual column names first
cat("Available columns in dataset:\n")
## Available columns in dataset:
print(names(field_data_raw))
## [1] "plant" "row_id" "plot_id" "plot_row" "plot_col" "rep"
## [7] "Y_pos" "X_pos" "donor" "inv4m_gt" "group" "DOA"
## [13] "DOS" "DTA" "DTS" "DTA_GDD" "DTS_GDD" "LAE"
## [19] "PH" "EN" "SL" "BL" "BW"
# Define phenotypes based on actual column names from DTA to BW
phenotypes <- c("DTA", "DTS", "DTA_GDD", "DTS_GDD", "LAE", "PH", "EN", "SL", "BL", "BW","EBA")
cat("Requested phenotypes for analysis:\n")
## Requested phenotypes for analysis:
print(phenotypes)
## [1] "DTA" "DTS" "DTA_GDD" "DTS_GDD" "LAE" "PH" "EN"
## [8] "SL" "BL" "BW" "EBA"
# Verify all phenotypes exist in the dataset
missing_cols <- setdiff(phenotypes, names(field_data))
if (length(missing_cols) > 0) {
cat("Warning: These phenotype columns are missing from the dataset:\n")
print(missing_cols)
}
available_phenotypes <- intersect(phenotypes, names(field_data))
cat("Phenotypes available for analysis:\n")
## Phenotypes available for analysis:
print(available_phenotypes)
## [1] "DTA" "DTS" "DTA_GDD" "DTS_GDD" "LAE" "PH" "EN"
## [8] "SL" "BL" "BW" "EBA"
# Update phenotypes list to only include available columns
phenotypes <- available_phenotypes
cat("Final phenotypes list for analysis:\n")
## Final phenotypes list for analysis:
print(phenotypes)
## [1] "DTA" "DTS" "DTA_GDD" "DTS_GDD" "LAE" "PH" "EN"
## [8] "SL" "BL" "BW" "EBA"
cat("Number of phenotypes:", length(phenotypes), "\n")
## Number of phenotypes: 11
cat("EBA in final list:", "EBA" %in% phenotypes, "\n")
## EBA in final list: TRUE
cat("GDD variables in list:", any(grepl("GDD", phenotypes)), "\n")
## GDD variables in list: TRUE
cat("EBA exists in field_data:", "EBA" %in% names(field_data), "\n")
## EBA exists in field_data: TRUE
if ("EBA" %in% names(field_data)) {
cat("EBA summary statistics:\n")
print(summary(field_data$EBA))
}
## EBA summary statistics:
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 209.6 326.4 369.0 369.6 408.7 555.2 72
cat("Original data dimensions:", dim(field_data), "\n")
## Original data dimensions: 442 24
head(field_data)
## plant_id row_id plot_id plot_row plot_col block y x donor inv4m
## 1 1 4573 1 1 1 3 1.7 1.008627 TMEX INV4M
## 2 2 4573 1 1 1 3 2.3 1.000498 TMEX INV4M
## 3 3 4573 1 1 1 3 3.6 1.000701 TMEX INV4M
## 4 4 4573 1 1 1 3 4.0 1.004691 TMEX INV4M
## 5 5 4573 1 1 1 3 4.5 1.000889 TMEX INV4M
## 6 6 4573 1 1 1 3 5.0 1.000222 TMEX INV4M
## group DOA DOS DTA DTS DTA_GDD DTS_GDD LAE PH EN SL BL BW EBA
## 1 TMEX-INV4M 20 22 77 79 1915 1975 5 192 2 13.5 53.0 8.0 318.0000
## 2 TMEX-INV4M 20 22 77 79 1915 1975 5 209 3 14.0 50.5 7.5 284.0625
## 3 TMEX-INV4M 23 24 80 81 2011 2047 5 208 3 NA NA NA NA
## 4 TMEX-INV4M 23 24 80 81 2011 2047 5 212 3 14.0 53.5 7.5 300.9375
## 5 TMEX-INV4M 21 23 78 80 1943 2011 5 218 3 13.5 56.0 8.0 336.0000
## 6 TMEX-INV4M 22 24 79 81 1975 2047 5 225 3 14.0 64.0 8.5 408.0000
Understanding missing data patterns is crucial for valid statistical inference.
# Use the correct phenotypes list - NO GDD, INCLUDE EBA
missing_data_phenotypes <- c("DTA", "DTS", "LAE", "PH", "EN", "SL", "BL", "BW", "EBA")
# Create missing data summary for correct phenotypes
missing_summary <- field_data %>%
select(all_of(missing_data_phenotypes)) %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "phenotype", values_to = "missing_count") %>%
mutate(
total_obs = nrow(field_data),
missing_pct = round(missing_count / total_obs * 100, 1),
available_n = total_obs - missing_count
)
print(missing_summary)
## # A tibble: 9 × 5
## phenotype missing_count total_obs missing_pct available_n
## <chr> <int> <int> <dbl> <int>
## 1 DTA 0 442 0 442
## 2 DTS 0 442 0 442
## 3 LAE 1 442 0.2 441
## 4 PH 0 442 0 442
## 5 EN 0 442 0 442
## 6 SL 47 442 10.6 395
## 7 BL 57 442 12.9 385
## 8 BW 62 442 14 380
## 9 EBA 72 442 16.3 370
# Use the correct phenotypes list for missing data visualization
missing_data_phenotypes <- c("DTA", "DTS", "LAE", "PH", "EN", "SL", "BL", "BW", "EBA")
# Visualize missing data patterns for correct phenotypes
field_data %>%
select(all_of(missing_data_phenotypes)) %>%
VIM::aggr(col = c('navyblue', 'red'),
numbers = TRUE,
sortVars = TRUE,
labels = missing_data_phenotypes)
Missing data patterns across all phenotypes including EBA
##
## Variables sorted by number of missings:
## Variable Count
## EBA 0.162895928
## BW 0.140271493
## BL 0.128959276
## SL 0.106334842
## LAE 0.002262443
## DTA 0.000000000
## DTS 0.000000000
## PH 0.000000000
## EN 0.000000000
# Check if missing data is related to treatments
missing_by_treatment <- field_data %>%
select(donor, inv4m, all_of(phenotypes)) %>%
group_by(donor, inv4m) %>%
summarise(across(all_of(phenotypes), ~sum(is.na(.))), .groups = 'drop')
cat("Missing data counts by treatment combination:\n")
## Missing data counts by treatment combination:
print(missing_by_treatment)
## # A tibble: 4 × 13
## donor inv4m DTA DTS DTA_GDD DTS_GDD LAE PH EN SL BL BW
## <fct> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 MI21 CTRL 0 0 0 0 0 0 0 10 12 16
## 2 MI21 INV4M 0 0 0 0 0 0 0 13 15 19
## 3 TMEX CTRL 0 0 0 0 0 0 0 12 15 13
## 4 TMEX INV4M 0 0 0 3 1 0 0 12 15 14
## # ℹ 1 more variable: EBA <int>
# Check spatial distribution of missing data
field_data %>%
select(x, y, all_of(phenotypes)) %>%
mutate(any_missing = rowSums(is.na(select(., all_of(phenotypes)))) > 0) %>%
ggplot(aes(x = x, y = y, color = any_missing)) +
geom_point(alpha = 0.7) +
scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
labs(title = "Spatial distribution of missing observations",
color = "Has missing\ndata") +
theme_classic() +
coord_equal()
Comparing spatial autocorrelation patterns across phenotypes using standardized variograms.
#' Calculate scaled variogram for a phenotype
#'
#' @param data Complete-case dataset for the phenotype
#' @param phenotype_name Name of the phenotype column
#' @return List containing variogram data and scaling info
calculate_scaled_variogram <- function(data, phenotype_name) {
# Fit simple model to get residuals
formula_str <- paste(phenotype_name, "~ donor * inv4m")
m_simple <- lm(as.formula(formula_str), data = data)
# Add residuals to data
data$resids <- NA
data$resids[as.integer(rownames(m_simple$model))] <- residuals(m_simple)
# Create gstat object and calculate variogram
g_obj <- gstat(id = "resids",
formula = resids ~ 1,
locations = ~x+y,
data = data %>% filter(!is.na(resids)))
vgm_data <- variogram(g_obj)
# Scale to 0-100
max_gamma <- max(vgm_data$gamma)
vgm_data$gamma_scaled <- (vgm_data$gamma / max_gamma) * 100
return(list(
variogram = vgm_data,
max_gamma = max_gamma,
n_obs = nrow(data),
phenotype = phenotype_name
))
}
# FORCE the correct phenotypes list for variograms - NO GDD, INCLUDE EBA
variogram_phenotypes <- c("DTA", "DTS", "LAE", "PH", "EN", "SL", "BL", "BW", "EBA")
# Calculate scaled variograms for each phenotype
variogram_results <- list()
# First, let's check sample sizes for all traits
cat("=== SAMPLE SIZE CHECK FOR ALL TRAITS ===\n")
## === SAMPLE SIZE CHECK FOR ALL TRAITS ===
for (trait in variogram_phenotypes) {
if (trait %in% names(field_data)) {
trait_data <- field_data %>%
filter(!is.na(.data[[trait]]) &
!is.na(x) & !is.na(y) &
!is.na(donor) & !is.na(inv4m) & !is.na(block))
cat(trait, ": n =", nrow(trait_data), "\n")
} else {
cat(trait, ": COLUMN NOT FOUND in field_data\n")
}
}
## DTA : n = 442
## DTS : n = 442
## LAE : n = 441
## PH : n = 442
## EN : n = 442
## SL : n = 395
## BL : n = 385
## BW : n = 380
## EBA : n = 370
cat("\n=== PROCESSING VARIOGRAMS ===\n")
##
## === PROCESSING VARIOGRAMS ===
for (trait in variogram_phenotypes) {
cat("Processing variogram for", trait, "...\n")
# Check if trait exists in data
if (!trait %in% names(field_data)) {
cat(" ERROR: Column", trait, "not found in field_data\n")
next
}
# Create complete-case dataset for this trait
trait_data <- field_data %>%
filter(!is.na(.data[[trait]]) &
!is.na(x) & !is.na(y) &
!is.na(donor) & !is.na(inv4m) & !is.na(block))
cat(" Sample size:", nrow(trait_data), "\n")
if (nrow(trait_data) > 10) { # Lowered threshold for minimum observations
tryCatch({
variogram_results[[trait]] <- calculate_scaled_variogram(trait_data, trait)
cat(" Successfully calculated variogram\n")
}, error = function(e) {
cat(" ERROR calculating variogram:", e$message, "\n")
})
} else {
cat(" Warning: Insufficient data for", trait, "(need > 10 observations)\n")
}
}
## Processing variogram for DTA ...
## Sample size: 442
## Successfully calculated variogram
## Processing variogram for DTS ...
## Sample size: 442
## Successfully calculated variogram
## Processing variogram for LAE ...
## Sample size: 441
## Successfully calculated variogram
## Processing variogram for PH ...
## Sample size: 442
## Successfully calculated variogram
## Processing variogram for EN ...
## Sample size: 442
## Successfully calculated variogram
## Processing variogram for SL ...
## Sample size: 395
## Successfully calculated variogram
## Processing variogram for BL ...
## Sample size: 385
## Successfully calculated variogram
## Processing variogram for BW ...
## Sample size: 380
## Successfully calculated variogram
## Processing variogram for EBA ...
## Sample size: 370
## Successfully calculated variogram
cat("\nVariograms successfully calculated for:", length(variogram_results), "traits\n")
##
## Variograms successfully calculated for: 9 traits
cat("Traits with variograms:", names(variogram_results), "\n")
## Traits with variograms: DTA DTS LAE PH EN SL BL BW EBA
# Create summary table
if (length(variogram_results) > 0) {
vgm_summary <- map_dfr(variogram_results, function(x) {
tibble(
phenotype = x$phenotype,
n_obs = x$n_obs,
max_semivariance = round(x$max_gamma, 3)
)
})
cat("\n=== VARIOGRAM SUMMARY STATISTICS ===\n")
print(vgm_summary)
} else {
cat("No variograms could be calculated!\n")
}
##
## === VARIOGRAM SUMMARY STATISTICS ===
## # A tibble: 9 × 3
## phenotype n_obs max_semivariance
## <chr> <int> <dbl>
## 1 DTA 442 2.53
## 2 DTS 442 2.67
## 3 LAE 441 0.497
## 4 PH 442 94.2
## 5 EN 442 0.227
## 6 SL 395 0.369
## 7 BL 385 26.3
## 8 BW 380 0.627
## 9 EBA 370 3487.
# Combine all variogram data
all_vgm_data <- map_dfr(variogram_results, function(x) {
x$variogram %>%
mutate(phenotype = x$phenotype,
n_obs = x$n_obs)
})
# Create single combined variogram plot for direct comparison
ggplot(all_vgm_data, aes(x = dist, y = gamma_scaled, color = phenotype)) +
geom_point(alpha = 0.7, size = 1.5) +
geom_line(alpha = 0.8, linewidth = 0.8) +
scale_color_brewer(type = "qual", palette = "Set3") +
labs(
title = "Scaled Empirical Variograms - All Phenotypes",
subtitle = "Scaled to 0-100 for direct comparison of spatial autocorrelation patterns",
x = "Distance",
y = "Scaled Semivariance (0-100)",
color = "Phenotype"
) +
theme_classic2(base_size = 12) +
theme(
legend.position = "right",
legend.title = element_text(face = "bold"),
plot.title = element_text(face = "bold")
) +
guides(color = guide_legend(override.aes = list(alpha = 1, size = 2)))
Scaled empirical variograms for all phenotypes (0-100 scale) - single plot for direct comparison
#' Create spatial plot for a phenotype
#'
#' @param data Dataset containing the phenotype
#' @param col_var Phenotype column name
#' @param plot_title Plot title
#' @param legend_name Legend title
#' @param x_lab X-axis label
create_spatial_plot <- function(data, col_var, plot_title, legend_name, x_lab = "") {
col_sym <- ensym(col_var)
data %>%
filter(!is.na(!!col_sym)) %>%
ggplot(aes(x = x, y = y)) +
geom_point(aes(color = !!col_sym), size = 1.2, alpha = 0.8) +
scale_color_distiller(palette = "RdYlGn", direction = 1, name = legend_name) +
labs(title = plot_title, x = x_lab, y = "Field Y position") +
theme_classic2(base_size = 10) +
theme(legend.position = "right") +
coord_equal()
}
# Group 1: Flowering traits (days and GDD)
plot_dta <- create_spatial_plot(field_data, DTA, "Days to Anthesis", "days")
plot_dts <- create_spatial_plot(field_data, DTS, "Days to Silking", "days")
plot_dta_gdd <- create_spatial_plot(field_data, DTA_GDD, "Anthesis GDD", "GDD", x_lab = "Field X position")
ggarrange(plot_dta,
plot_dts + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()),
plot_dta_gdd + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()),
nrow = 1, widths = c(1.2, 1, 1))
Spatial distribution of phenotypes across the experimental field
# Group 2: More flowering traits and plant architecture
plot_dts_gdd <- create_spatial_plot(field_data, DTS_GDD, "Silking GDD", "GDD")
plot_lae <- create_spatial_plot(field_data, LAE, "Leaves Above Ear", "count")
plot_ph <- create_spatial_plot(field_data, PH, "Plant Height", "cm", x_lab = "Field X position")
ggarrange(plot_dts_gdd,
plot_lae + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()),
plot_ph + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()),
nrow = 1, widths = c(1.2, 1, 1))
Spatial distribution of phenotypes across the experimental field
# Group 3: Morphological and reproductive traits
plot_en <- create_spatial_plot(field_data, EN, "Ear Number", "count")
plot_sl <- create_spatial_plot(field_data, SL, "Sheath Length", "cm")
plot_bl <- create_spatial_plot(field_data, BL, "Blade Length", "cm")
ggarrange(plot_en,
plot_sl + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()),
plot_bl + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()),
nrow = 1, widths = c(1.2, 1, 1))
Spatial distribution of phenotypes across the experimental field
# Group 4: Final morphological traits
if ("EBA" %in% names(field_data)) {
plot_bw <- create_spatial_plot(field_data, BW, "Blade Width", "cm")
plot_eba <- create_spatial_plot(field_data, EBA, "Estimated Blade Area", "cm²", x_lab = "Field X position")
ggarrange(plot_bw,
plot_eba + theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()),
nrow = 1, widths = c(1.2, 1))
} else {
cat("EBA column not found in field_data. Available columns:\n")
print(names(field_data))
plot_bw <- create_spatial_plot(field_data, BW, "Blade Width", "cm", x_lab = "Field X position")
print(plot_bw)
}
Spatial distribution of phenotypes across the experimental field
#' Fit all six model structures for a given phenotype
#'
#' @param data Complete-case dataset for the phenotype
#' @param phenotype_name Name of the phenotype column
#' @return List of fitted models with error handling
fit_all_models <- function(data, phenotype_name) {
formula_str <- paste(phenotype_name, "~ donor * inv4m")
models <- list()
# Create plot-level data for Model 1
plot_data <- data %>%
group_by(plot_id, plot_row, plot_col, block, donor, inv4m) %>%
summarise(trait_mean = mean(.data[[phenotype_name]], na.rm = TRUE),
.groups = 'drop')
# Model 1: Plot means baseline
cat(" Attempting Model 1...\n")
model_1 <- tryCatch({
lm(trait_mean ~ poly(plot_row, 2) + poly(plot_col, 2) + block + donor * inv4m,
data = plot_data)
}, error = function(e) {
cat(" Model 1 FAILED:", e$message, "\n")
return(NULL)
})
models[["model_1"]] <- model_1
if (!is.null(model_1)) cat(" Model 1: SUCCESS\n")
# Model 2: Spatial correlation only
cat(" Attempting Model 2...\n")
model_2 <- tryCatch({
gls(as.formula(formula_str),
correlation = corSpher(form = ~ x + y | block/plot_id, nugget = TRUE),
data = data, method = "REML")
}, error = function(e) {
cat(" Model 2 FAILED:", e$message, "\n")
return(NULL)
})
models[["model_2"]] <- model_2
if (!is.null(model_2)) cat(" Model 2: SUCCESS\n")
# Model 3: Plot random effects
cat(" Attempting Model 3...\n")
model_3 <- tryCatch({
lme(as.formula(paste(phenotype_name, "~ donor * inv4m + block")),
random = ~ 1 | plot_id, data = data, method = "REML")
}, error = function(e) {
cat(" Model 3 FAILED:", e$message, "\n")
return(NULL)
})
models[["model_3"]] <- model_3
if (!is.null(model_3)) cat(" Model 3: SUCCESS\n")
# Model 4: Plot random + gradients
cat(" Attempting Model 4...\n")
model_4 <- tryCatch({
lme(as.formula(paste(phenotype_name, "~ donor * inv4m + block + plot_row + plot_col")),
random = ~ 1 | plot_id, data = data, method = "REML")
}, error = function(e) {
cat(" Model 4 FAILED:", e$message, "\n")
return(NULL)
})
models[["model_4"]] <- model_4
if (!is.null(model_4)) cat(" Model 4: SUCCESS\n")
# Model 5: Block random + spatial correlation
cat(" Attempting Model 5...\n")
model_5 <- tryCatch({
lme(as.formula(formula_str),
random = ~ 1 | block,
correlation = corSpher(form = ~ x + y | block),
data = data, method = "REML")
}, error = function(e) {
cat(" Model 5 FAILED:", e$message, "\n")
return(NULL)
})
models[["model_5"]] <- model_5
if (!is.null(model_5)) cat(" Model 5: SUCCESS\n")
# Model 6: Full hierarchical + spatial polynomials
cat(" Attempting Model 6...\n")
model_6 <- tryCatch({
lme(as.formula(paste(phenotype_name, "~ donor * inv4m + poly(x, 2) + poly(y, 2)")),
random = ~ 1 | block/plot_id, data = data, method = "REML")
}, error = function(e) {
cat(" Model 6 FAILED:", e$message, "\n")
return(NULL)
})
models[["model_6"]] <- model_6
if (!is.null(model_6)) cat(" Model 6: SUCCESS\n")
return(models)
}
#' Extract model comparison statistics
#'
#' @param models List of fitted models
#' @param phenotype_name Name of the phenotype
#' @return Data frame with model comparison metrics
extract_model_stats <- function(models, phenotype_name) {
cat(" Extracting stats for", phenotype_name, "\n")
cat(" Models available:", names(models), "\n")
cat(" Non-null models:", sum(!sapply(models, is.null)), "\n")
# Initialize tibble with proper column structure
model_stats <- tibble(
phenotype = character(),
model = character(),
AIC = numeric(),
BIC = numeric(),
logLik = numeric(),
converged = logical()
)
for (i in 1:6) {
model_name <- paste0("model_", i)
model <- models[[model_name]]
cat(" Processing", model_name, "...")
# Check if model exists and is a proper model object
if (!is.null(model)) {
cat(" exists, class:", class(model)[1])
tryCatch({
# Extract stats based on model class
if (inherits(model, "lm")) {
# For lm objects
aic_val <- AIC(model)
bic_val <- BIC(model)
ll_val <- as.numeric(logLik(model))
model_stats <- bind_rows(model_stats, tibble(
phenotype = phenotype_name,
model = model_name,
AIC = aic_val,
BIC = bic_val,
logLik = ll_val,
converged = TRUE
))
cat(" - SUCCESS (AIC:", round(aic_val, 1), ", BIC:", round(bic_val, 1), ")\n")
} else if (inherits(model, c("lme", "gls"))) {
# For lme/gls objects - check if they have proper logLik
aic_val <- AIC(model)
bic_val <- BIC(model)
ll_val <- as.numeric(logLik(model))
model_stats <- bind_rows(model_stats, tibble(
phenotype = phenotype_name,
model = model_name,
AIC = aic_val,
BIC = bic_val,
logLik = ll_val,
converged = TRUE
))
cat(" - SUCCESS (AIC:", round(aic_val, 1), ", BIC:", round(bic_val, 1), ")\n")
} else {
cat(" - Unknown class:", class(model), "\n")
}
}, error = function(e) {
cat(" - ERROR:", e$message, "\n")
# Add row with NAs for failed extraction
model_stats <<- bind_rows(model_stats, tibble(
phenotype = phenotype_name,
model = model_name,
AIC = NA_real_,
BIC = NA_real_,
logLik = NA_real_,
converged = FALSE
))
})
} else {
cat(" - NULL (failed to fit)\n")
# Model is NULL (failed to fit)
model_stats <- bind_rows(model_stats, tibble(
phenotype = phenotype_name,
model = model_name,
AIC = NA_real_,
BIC = NA_real_,
logLik = NA_real_,
converged = FALSE
))
}
}
cat(" Final stats for", phenotype_name, ":", nrow(model_stats), "rows\n")
return(model_stats)
}
# Initialize results storage
all_models <- list()
all_model_stats <- tibble()
model_fitting_diagnostics <- tibble()
# Update phenotypes list for model fitting to match what we want
phenotypes_for_models <- c("DTA", "DTS", "LAE", "PH", "EN", "SL", "BL", "BW", "EBA")
# Fit models for each phenotype
for (trait in phenotypes_for_models) {
cat("\n=== FITTING MODELS FOR", trait, "===\n")
tryCatch({
# Create complete-case dataset
trait_data <- field_data %>%
filter(!is.na(.data[[trait]]) &
!is.na(x) & !is.na(y) &
!is.na(donor) & !is.na(inv4m) &
!is.na(block) & !is.na(plot_id) &
!is.na(plot_row) & !is.na(plot_col))
cat(" Sample size:", nrow(trait_data), "\n")
# Record diagnostics for this trait
trait_diagnostics <- tibble(
phenotype = trait,
sample_size = nrow(trait_data),
n_donors = length(unique(trait_data$donor)),
n_inv4m_levels = length(unique(trait_data$inv4m)),
n_blocks = length(unique(trait_data$block)),
n_plots = length(unique(trait_data$plot_id)),
trait_range = paste(round(range(trait_data[[trait]], na.rm = TRUE), 2), collapse = " - "),
models_attempted = 0,
models_successful = 0,
failed_models = "",
error_messages = ""
)
if (nrow(trait_data) > 50) { # Minimum sample size check
# Fit all models
cat(" Fitting models...\n")
trait_models <- fit_all_models(trait_data, trait)
all_models[[trait]] <- trait_models
# Count successful models
successful_models <- names(trait_models)[!sapply(trait_models, is.null)]
failed_models <- names(trait_models)[sapply(trait_models, is.null)]
trait_diagnostics$models_attempted <- 6
trait_diagnostics$models_successful <- length(successful_models)
trait_diagnostics$failed_models <- paste(failed_models, collapse = ", ")
cat(" Successful models:", length(successful_models), "out of 6\n")
if (length(failed_models) > 0) {
cat(" Failed models:", paste(failed_models, collapse = ", "), "\n")
}
# Extract statistics
if (length(successful_models) > 0) {
cat(" Extracting model statistics...\n")
trait_stats <- extract_model_stats(trait_models, trait)
cat(" Extracted", nrow(trait_stats), "rows of statistics\n")
all_model_stats <- bind_rows(all_model_stats, trait_stats)
cat(" Total model stats now:", nrow(all_model_stats), "rows\n")
} else {
cat(" No successful models for", trait, "\n")
}
} else {
cat(" Warning: Insufficient data for", trait, "\n")
trait_diagnostics$error_messages <- "Insufficient sample size (< 50 observations)"
}
# Add diagnostics
model_fitting_diagnostics <- bind_rows(model_fitting_diagnostics, trait_diagnostics)
}, error = function(e) {
cat(" MAJOR ERROR processing", trait, ":", e$message, "\n")
# Still add basic diagnostics even if everything failed
error_diagnostics <- tibble(
phenotype = trait,
sample_size = NA,
n_donors = NA,
n_inv4m_levels = NA,
n_blocks = NA,
n_plots = NA,
trait_range = NA,
models_attempted = 0,
models_successful = 0,
failed_models = "",
error_messages = paste("Major error:", e$message)
)
model_fitting_diagnostics <<- bind_rows(model_fitting_diagnostics, error_diagnostics)
})
}
##
## === FITTING MODELS FOR DTA ===
## Sample size: 442
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for DTA
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 175.9 , BIC: 202 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 1540.6 , BIC: 1569.2 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 1549.3 , BIC: 1586 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 1554.5 , BIC: 1599.3 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 1629.3 , BIC: 1657.9 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 1508.2 , BIC: 1553 )
## Final stats for DTA : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 6 rows
##
## === FITTING MODELS FOR DTS ===
## Sample size: 442
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for DTS
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 188.4 , BIC: 214.5 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 1704.9 , BIC: 1733.5 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 1625.2 , BIC: 1661.8 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 1630.1 , BIC: 1674.9 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 1676.2 , BIC: 1704.8 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 1588.1 , BIC: 1632.9 )
## Final stats for DTS : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 12 rows
##
## === FITTING MODELS FOR LAE ===
## Sample size: 441
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for LAE
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 31.9 , BIC: 58 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 904.3 , BIC: 932.9 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 711.9 , BIC: 748.6 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 718.1 , BIC: 762.9 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 738.5 , BIC: 767 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 700.4 , BIC: 745.1 )
## Final stats for LAE : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 18 rows
##
## === FITTING MODELS FOR PH ===
## Sample size: 442
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for PH
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 438.4 , BIC: 464.5 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 3233.4 , BIC: 3262 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 3155.1 , BIC: 3191.7 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 3154.8 , BIC: 3199.6 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 3233.3 , BIC: 3261.9 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 3132.1 , BIC: 3176.9 )
## Final stats for PH : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 24 rows
##
## === FITTING MODELS FOR EN ===
## Sample size: 442
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for EN
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 12.7 , BIC: 38.8 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 619.3 , BIC: 647.9 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 598 , BIC: 634.7 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 609.5 , BIC: 654.3 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 610.3 , BIC: 638.9 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 579.1 , BIC: 623.9 )
## Final stats for EN : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 30 rows
##
## === FITTING MODELS FOR SL ===
## Sample size: 395
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for SL
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 29.1 , BIC: 55.2 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 743.5 , BIC: 771.3 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 723.7 , BIC: 759.4 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 736.1 , BIC: 779.6 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 722.7 , BIC: 750.5 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 720.6 , BIC: 764.2 )
## Final stats for SL : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 36 rows
##
## === FITTING MODELS FOR BL ===
## Sample size: 385
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for BL
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 321.3 , BIC: 347.4 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 2242.5 , BIC: 2270.1 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 2250.6 , BIC: 2286.1 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 2252.7 , BIC: 2295.9 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 2277.6 , BIC: 2305.2 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 2226.3 , BIC: 2269.5 )
## Final stats for BL : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 42 rows
##
## === FITTING MODELS FOR BW ===
## Sample size: 380
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for BW
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 104.8 , BIC: 130.9 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 891.3 , BIC: 918.8 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 877.8 , BIC: 913.1 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 888.5 , BIC: 931.6 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 891.3 , BIC: 918.8 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 865.4 , BIC: 908.5 )
## Final stats for BW : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 48 rows
##
## === FITTING MODELS FOR EBA ===
## Sample size: 370
## Fitting models...
## Attempting Model 1...
## Model 1: SUCCESS
## Attempting Model 2...
## Model 2: SUCCESS
## Attempting Model 3...
## Model 3: SUCCESS
## Attempting Model 4...
## Model 4: SUCCESS
## Attempting Model 5...
## Model 5: SUCCESS
## Attempting Model 6...
## Model 6: SUCCESS
## Successful models: 6 out of 6
## Extracting model statistics...
## Extracting stats for EBA
## Models available: model_1 model_2 model_3 model_4 model_5 model_6
## Non-null models: 6
## Processing model_1 ... exists, class: lm - SUCCESS (AIC: 654.4 , BIC: 680.4 )
## Processing model_2 ... exists, class: gls - SUCCESS (AIC: 4018.6 , BIC: 4046 )
## Processing model_3 ... exists, class: lme - SUCCESS (AIC: 3972 , BIC: 4007 )
## Processing model_4 ... exists, class: lme - SUCCESS (AIC: 3965.8 , BIC: 4008.6 )
## Processing model_5 ... exists, class: lme - SUCCESS (AIC: 4016.4 , BIC: 4043.7 )
## Processing model_6 ... exists, class: lme - SUCCESS (AIC: 3947.6 , BIC: 3990.4 )
## Final stats for EBA : 6 rows
## Extracted 6 rows of statistics
## Total model stats now: 54 rows
cat("\n=== FINAL SUMMARY ===\n")
##
## === FINAL SUMMARY ===
cat("Processed", length(all_models), "phenotypes out of", length(phenotypes), "requested\n")
## Processed 9 phenotypes out of 11 requested
cat("Phenotypes with models:", names(all_models), "\n")
## Phenotypes with models: DTA DTS LAE PH EN SL BL BW EBA
cat("Missing phenotypes:", setdiff(phenotypes, names(all_models)), "\n")
## Missing phenotypes: DTA_GDD DTS_GDD
# Display model fitting diagnostics
cat("\n=== MODEL FITTING DIAGNOSTICS ===\n")
##
## === MODEL FITTING DIAGNOSTICS ===
print(model_fitting_diagnostics)
## # A tibble: 9 × 11
## phenotype sample_size n_donors n_inv4m_levels n_blocks n_plots trait_range
## <chr> <int> <int> <int> <int> <int> <chr>
## 1 DTA 442 2 2 4 64 74 - 83
## 2 DTS 442 2 2 4 64 74 - 84
## 3 LAE 441 2 2 4 64 4 - 8
## 4 PH 442 2 2 4 64 159 - 225
## 5 EN 442 2 2 4 64 2 - 3
## 6 SL 395 2 2 4 64 11.5 - 15.5
## 7 BL 385 2 2 4 64 42.5 - 71.5
## 8 BW 380 2 2 4 64 6.5 - 10.5
## 9 EBA 370 2 2 4 64 209.62 - 555.19
## # ℹ 4 more variables: models_attempted <dbl>, models_successful <int>,
## # failed_models <chr>, error_messages <chr>
# Summary of model fitting success
cat("\n=== MODEL FITTING SUMMARY ===\n")
##
## === MODEL FITTING SUMMARY ===
successful_traits <- model_fitting_diagnostics %>%
filter(models_successful > 0) %>%
nrow()
cat("Traits with successful model fits:", successful_traits, "out of", length(phenotypes), "\n")
## Traits with successful model fits: 9 out of 11
failed_traits <- model_fitting_diagnostics %>%
filter(models_successful == 0)
if (nrow(failed_traits) > 0) {
cat("\nTraits that failed all model fitting:\n")
for (i in 1:nrow(failed_traits)) {
cat(" -", failed_traits$phenotype[i], ":", failed_traits$error_messages[i], "\n")
}
}
# Model-specific failure patterns
if (nrow(all_model_stats) > 0) {
model_failure_summary <- all_model_stats %>%
group_by(model) %>%
summarise(
n_attempted = n(),
n_successful = sum(!is.na(BIC)),
n_failed = sum(is.na(BIC)),
success_rate = round(n_successful / n_attempted * 100, 1),
.groups = 'drop'
)
cat("\n=== MODEL-SPECIFIC SUCCESS RATES ===\n")
print(model_failure_summary)
} else {
cat("No model statistics available\n")
}
##
## === MODEL-SPECIFIC SUCCESS RATES ===
## # A tibble: 6 × 5
## model n_attempted n_successful n_failed success_rate
## <chr> <int> <int> <int> <dbl>
## 1 model_1 9 9 0 100
## 2 model_2 9 9 0 100
## 3 model_3 9 9 0 100
## 4 model_4 9 9 0 100
## 5 model_5 9 9 0 100
## 6 model_6 9 9 0 100
# Check if we have any model statistics
cat("Total model statistics available:", nrow(all_model_stats), "\n")
## Total model statistics available: 54
if (nrow(all_model_stats) > 0) {
cat("Columns in all_model_stats:", names(all_model_stats), "\n")
cat("Unique phenotypes:", length(unique(all_model_stats$phenotype)), "\n")
cat("Unique models:", length(unique(all_model_stats$model)), "\n")
cat("First few rows:\n")
print(head(all_model_stats))
} else {
cat("ERROR: No model statistics available. Check model fitting process.\n")
}
## Columns in all_model_stats: phenotype model AIC BIC logLik converged
## Unique phenotypes: 9
## Unique models: 6
## First few rows:
## # A tibble: 6 × 6
## phenotype model AIC BIC logLik converged
## <chr> <chr> <dbl> <dbl> <dbl> <lgl>
## 1 DTA model_1 176. 202. -76.0 TRUE
## 2 DTA model_2 1541. 1569. -763. TRUE
## 3 DTA model_3 1549. 1586. -766. TRUE
## 4 DTA model_4 1554. 1599. -766. TRUE
## 5 DTA model_5 1629. 1658. -808. TRUE
## 6 DTA model_6 1508. 1553. -743. TRUE
# Only proceed if we have data
if (nrow(all_model_stats) > 0) {
# BIC comparison table - ensure all combinations are present, exclude model_1
bic_matrix <- all_model_stats %>%
filter(model != "model_1") %>% # Exclude model_1 from comparison
select(phenotype, model, BIC) %>%
complete(phenotype, model, fill = list(BIC = NA)) %>% # Fill missing combinations with NA
pivot_wider(names_from = model, values_from = BIC)
cat("BIC matrix dimensions:", dim(bic_matrix), "\n")
cat("BIC matrix columns:", names(bic_matrix), "\n")
# Convert to matrix for kable, handling the phenotype column
if (nrow(bic_matrix) > 0 && ncol(bic_matrix) > 1) {
# Check if phenotype column exists
if ("phenotype" %in% names(bic_matrix)) {
# Use a more robust approach to create the table
bic_table <- bic_matrix %>%
select(-phenotype) %>%
as.data.frame()
# Set row names manually
rownames(bic_table) <- bic_matrix$phenotype
cat("Final BIC table dimensions:", dim(bic_table), "\n")
cat("Final BIC table column names:", colnames(bic_table), "\n")
cat("Final BIC table row names:", rownames(bic_table), "\n")
# Try simple printing first to avoid kable issues
cat("\n=== BIC VALUES FOR MIXED-EFFECTS MODELS (LOWER IS BETTER) ===\n")
cat("Note: Model_1 (fixed effects only) excluded from comparison\n")
print(round(bic_table, 1))
} else {
cat("Warning: No 'phenotype' column found in BIC matrix\n")
print(bic_matrix)
}
} else {
cat("BIC matrix is empty or has insufficient columns\n")
print(bic_matrix)
}
# Best model for each phenotype (excluding model_1)
best_models <- all_model_stats %>%
filter(!is.na(BIC) & model != "model_1") %>% # Exclude model_1 from selection
group_by(phenotype) %>%
slice_min(BIC, n = 1) %>%
select(phenotype, best_model = model, best_BIC = BIC)
if (nrow(best_models) > 0) {
cat("\n=== BEST MIXED-EFFECTS MODEL (LOWEST BIC) FOR EACH PHENOTYPE ===\n")
cat("Note: Model_1 (fixed effects only) excluded from selection\n")
print(best_models)
# Model selection frequency
model_frequency <- best_models %>%
count(best_model, sort = TRUE) %>%
mutate(frequency = n / nrow(best_models) * 100)
cat("\n=== FREQUENCY OF EACH MIXED-EFFECTS MODEL BEING SELECTED AS BEST ===\n")
cat("Note: Model_1 (fixed effects only) excluded from selection\n")
print(model_frequency)
} else {
cat("No valid model comparisons available\n")
}
} else {
cat("Skipping model comparison - no model statistics available\n")
}
## BIC matrix dimensions: 9 6
## BIC matrix columns: phenotype model_2 model_3 model_4 model_5 model_6
## Final BIC table dimensions: 9 5
## Final BIC table column names: model_2 model_3 model_4 model_5 model_6
## Final BIC table row names: BL BW DTA DTS EBA EN LAE PH SL
##
## === BIC VALUES FOR MIXED-EFFECTS MODELS (LOWER IS BETTER) ===
## Note: Model_1 (fixed effects only) excluded from comparison
## model_2 model_3 model_4 model_5 model_6
## BL 2270.1 2286.1 2295.9 2305.2 2269.5
## BW 918.8 913.1 931.6 918.8 908.5
## DTA 1569.2 1586.0 1599.3 1657.9 1553.0
## DTS 1733.5 1661.8 1674.9 1704.8 1632.9
## EBA 4046.0 4007.0 4008.6 4043.7 3990.4
## EN 647.9 634.7 654.3 638.9 623.9
## LAE 932.9 748.6 762.9 767.0 745.1
## PH 3262.0 3191.7 3199.6 3261.9 3176.9
## SL 771.3 759.4 779.6 750.5 764.2
##
## === BEST MIXED-EFFECTS MODEL (LOWEST BIC) FOR EACH PHENOTYPE ===
## Note: Model_1 (fixed effects only) excluded from selection
## # A tibble: 9 × 3
## # Groups: phenotype [9]
## phenotype best_model best_BIC
## <chr> <chr> <dbl>
## 1 BL model_6 2270.
## 2 BW model_6 908.
## 3 DTA model_6 1553.
## 4 DTS model_6 1633.
## 5 EBA model_6 3990.
## 6 EN model_6 624.
## 7 LAE model_6 745.
## 8 PH model_6 3177.
## 9 SL model_5 750.
##
## === FREQUENCY OF EACH MIXED-EFFECTS MODEL BEING SELECTED AS BEST ===
## Note: Model_1 (fixed effects only) excluded from selection
## # A tibble: 9 × 4
## # Groups: phenotype [9]
## phenotype best_model n frequency
## <chr> <chr> <int> <dbl>
## 1 BL model_6 1 11.1
## 2 BW model_6 1 11.1
## 3 DTA model_6 1 11.1
## 4 DTS model_6 1 11.1
## 5 EBA model_6 1 11.1
## 6 EN model_6 1 11.1
## 7 LAE model_6 1 11.1
## 8 PH model_6 1 11.1
## 9 SL model_5 1 11.1
# Only create visualization if we have model statistics
if (nrow(all_model_stats) > 0 && "phenotype" %in% names(all_model_stats)) {
# BIC differences from best model (excluding model_1)
bic_diff <- all_model_stats %>%
filter(!is.na(BIC) & model != "model_1") %>% # Exclude model_1
group_by(phenotype) %>%
mutate(bic_diff = BIC - min(BIC)) %>%
ungroup()
ggplot(bic_diff, aes(x = phenotype, y = bic_diff, fill = model)) +
geom_col(position = "dodge") +
scale_fill_brewer(type = "qual", palette = "Set3") +
labs(
title = "BIC differences from best mixed-effects model",
subtitle = "Lower values indicate better fit (Model_1 excluded)",
x = "Phenotype",
y = "ΔBIC from best model",
fill = "Model"
) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
} else {
cat("Skipping model selection visualization - no valid model statistics available\n")
}
Model selection patterns across phenotypes
#' Extract treatment effects using emmeans from the best model
#'
#' @param models List of fitted models for a phenotype
#' @param best_model_name Name of the best model
#' @param phenotype_name Name of the phenotype
#' @return List containing emmeans results and contrasts
extract_treatment_effects <- function(models, best_model_name, phenotype_name) {
model <- models[[best_model_name]]
if (is.null(model)) {
return(NULL)
}
tryCatch({
# Calculate estimated marginal means
emm <- emmeans(model, specs = ~ inv4m | donor)
# Calculate contrasts with CTRL as reference (reverse = TRUE)
contrasts <- pairs(emm, by = "donor", reverse = TRUE)
return(list(
phenotype = phenotype_name,
model_used = best_model_name,
emmeans = emm,
contrasts = contrasts,
emm_summary = summary(emm),
contrast_summary = summary(contrasts)
))
}, error = function(e) {
cat("Error extracting effects for", phenotype_name, ":", e$message, "\n")
return(NULL)
})
}
# Check if we have model results before proceeding
if (nrow(all_model_stats) == 0 || !exists("best_models")) {
cat("No model statistics or best models available. Skipping treatment effects analysis.\n")
treatment_effects <- list()
significant_effects <- tibble()
} else {
# Extract treatment effects using best models (exclude GDD phenotypes to avoid non-independent hypotheses)
treatment_effects <- list()
# All phenotypes included in effect size analysis (no GDD to exclude)
phenotypes_for_effects <- names(all_models)
cat("Phenotypes included in effect size analysis:\n")
cat(paste(phenotypes_for_effects, collapse = ", "), "\n")
for (trait in phenotypes_for_effects) {
best_model_info <- best_models %>% filter(phenotype == trait)
if (nrow(best_model_info) > 0) {
best_model_name <- best_model_info$best_model
effects <- extract_treatment_effects(
all_models[[trait]],
best_model_name,
trait
)
if (!is.null(effects)) {
treatment_effects[[trait]] <- effects
}
}
}
# Summary of significant contrasts
significant_effects <- map_dfr(treatment_effects, function(x) {
if (!is.null(x$contrast_summary)) {
x$contrast_summary %>%
as_tibble() %>%
mutate(phenotype = x$phenotype,
model_used = x$model_used) %>%
filter(p.value < 0.05) %>%
select(phenotype, model_used, donor, estimate, SE, p.value)
}
})
}
## Phenotypes included in effect size analysis:
## DTA, DTS, LAE, PH, EN, SL, BL, BW, EBA
if (nrow(significant_effects) > 0) {
cat("\n=== SIGNIFICANT INV4M EFFECTS (P < 0.05) BY DONOR BACKGROUND ===\n")
print(significant_effects)
} else {
cat("No significant inv4m effects detected at p < 0.05\n")
}
##
## === SIGNIFICANT INV4M EFFECTS (P < 0.05) BY DONOR BACKGROUND ===
## # A tibble: 11 × 6
## phenotype model_used donor estimate SE p.value
## <chr> <chr> <fct> <dbl> <dbl> <dbl>
## 1 DTA model_6 TMEX 1.61 0.291 0.000000784
## 2 DTS model_6 MI21 -0.695 0.289 0.0193
## 3 DTS model_6 TMEX 1.74 0.294 0.000000172
## 4 LAE model_6 TMEX -0.350 0.102 0.00113
## 5 PH model_6 TMEX 9.83 2.05 0.0000113
## 6 EN model_6 TMEX 0.367 0.0820 0.0000353
## 7 BL model_6 MI21 -3.11 0.942 0.00161
## 8 BW model_6 MI21 -0.583 0.157 0.000472
## 9 BW model_6 TMEX -0.410 0.159 0.0125
## 10 EBA model_6 MI21 -45.1 11.5 0.000220
## 11 EBA model_6 TMEX -24.4 11.6 0.0400
# Only create visualization if we have treatment effects
if (length(treatment_effects) > 0) {
# Extract contrasts for forest plot
forest_data <- map_dfr(treatment_effects, function(x) {
if (!is.null(x$contrast_summary)) {
x$contrast_summary %>%
as_tibble() %>%
mutate(
phenotype = x$phenotype,
model_used = x$model_used,
# Calculate standardized effect size (Cohen's d approximation)
std_effect = estimate / SE,
# Calculate confidence intervals for standardized effect
ci_lower = (estimate - 1.96 * SE) / SE,
ci_upper = (estimate + 1.96 * SE) / SE
) %>%
select(phenotype, donor, std_effect, ci_lower, ci_upper, p.value)
}
})
if (nrow(forest_data) > 0) {
# Add Bonferroni correction but don't filter yet
forest_data <- forest_data %>%
mutate(p.adj = p.adjust(p.value, method = "bonferroni")) %>%
# Filter to nominally significant results (p < 0.05)
filter(p.value < 0.05) %>%
# Create significance indicator for Bonferroni correction
mutate(bonferroni_sig = p.adj < 0.05)
if (nrow(forest_data) > 0) {
# Create trait labels and donor labels
forest_data <- forest_data %>%
mutate(
trait_label = case_when(
phenotype == "DTA" ~ "Days to Anthesis",
phenotype == "DTS" ~ "Days to Silking",
phenotype == "LAE" ~ "Leaves Above Ear",
phenotype == "PH" ~ "Plant Height",
phenotype == "EN" ~ "Nodes with Ears",
phenotype == "SL" ~ "Sheath Length",
phenotype == "BL" ~ "Blade Length",
phenotype == "BW" ~ "Blade Width",
phenotype == "EBA" ~ "Estimated Blade Area",
TRUE ~ phenotype
),
predictor = paste0("<i>Inv4m</i> from ", donor)
) %>%
# Sort by absolute value of standardized effect size
arrange(desc(abs(std_effect))) %>%
mutate(trait_label = factor(trait_label, levels = unique(trait_label)))
# Define position dodge for overlapping points
pd <- position_dodge(width = 0.3)
# Create forest plot
forest_plot <- forest_data %>%
ggplot(aes(x = std_effect, y = trait_label)) +
xlab("Standardized Effect Size") +
ylab("Trait") +
geom_vline(xintercept = 0, lty = 2) +
geom_point(position = pd, size = 4,
aes(color = bonferroni_sig)) +
geom_errorbar(aes(xmin = ci_lower, xmax = ci_upper),
position = pd, width = 0.2, size = 0.7) +
facet_wrap(. ~ predictor, ncol = 2) +
scale_color_manual(
values = c("TRUE" = "red", "FALSE" = "black"),
labels = c("TRUE" = "Bonferroni significant", "FALSE" = "Nominally significant"),
name = "Significance"
) +
theme_classic2(base_size = 20) +
theme(
strip.background = element_blank(),
strip.text = ggtext::element_markdown(face = "bold", size = 15),
axis.title.y = element_blank(),
axis.text.y = element_text(hjust = 1, color = "black"),
panel.grid.major.y = element_line(color = "grey90"),
plot.caption = element_text(hjust = 0),
legend.position = "bottom"
) +
labs(caption = "All nominally significant effects shown (p < 0.05). Red points survive Bonferroni correction.")
print(forest_plot)
# Print summary tables
cat("\n=== ALL NOMINALLY SIGNIFICANT EFFECTS (p < 0.05) ===\n")
all_sig_table <- forest_data %>%
select(phenotype, donor, std_effect, p.value, p.adj, bonferroni_sig) %>%
arrange(p.value)
print(all_sig_table)
# Show Bonferroni-corrected results separately
bonferroni_sig_effects <- forest_data %>% filter(bonferroni_sig)
if (nrow(bonferroni_sig_effects) > 0) {
cat("\n=== BONFERRONI-CORRECTED SIGNIFICANT EFFECTS (p.adj < 0.05) ===\n")
bonf_table <- bonferroni_sig_effects %>%
select(phenotype, donor, std_effect, p.value, p.adj) %>%
arrange(p.adj)
print(bonf_table)
} else {
cat("\n=== NO EFFECTS SURVIVE BONFERRONI CORRECTION ===\n")
}
} else {
cat("No nominally significant effects (p < 0.05)\n")
}
} else {
cat("No contrast data available for forest plot\n")
}
} else {
cat("Skipping forest plot - no treatment effects available\n")
}
Forest plot of standardized inv4m effects within each donor background
##
## === ALL NOMINALLY SIGNIFICANT EFFECTS (p < 0.05) ===
## # A tibble: 11 × 6
## phenotype donor std_effect p.value p.adj bonferroni_sig
## <chr> <fct> <dbl> <dbl> <dbl> <lgl>
## 1 DTS TMEX 5.93 0.000000172 0.00000310 TRUE
## 2 DTA TMEX 5.52 0.000000784 0.0000141 TRUE
## 3 PH TMEX 4.80 0.0000113 0.000204 TRUE
## 4 EN TMEX 4.48 0.0000353 0.000635 TRUE
## 5 EBA MI21 -3.94 0.000220 0.00397 TRUE
## 6 BW MI21 -3.70 0.000472 0.00849 TRUE
## 7 LAE TMEX -3.42 0.00113 0.0204 TRUE
## 8 BL MI21 -3.31 0.00161 0.0291 TRUE
## 9 BW TMEX -2.58 0.0125 0.226 FALSE
## 10 DTS MI21 -2.41 0.0193 0.347 FALSE
## 11 EBA TMEX -2.10 0.0400 0.719 FALSE
##
## === BONFERRONI-CORRECTED SIGNIFICANT EFFECTS (p.adj < 0.05) ===
## # A tibble: 8 × 5
## phenotype donor std_effect p.value p.adj
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 DTS TMEX 5.93 0.000000172 0.00000310
## 2 DTA TMEX 5.52 0.000000784 0.0000141
## 3 PH TMEX 4.80 0.0000113 0.000204
## 4 EN TMEX 4.48 0.0000353 0.000635
## 5 EBA MI21 -3.94 0.000220 0.00397
## 6 BW MI21 -3.70 0.000472 0.00849
## 7 LAE TMEX -3.42 0.00113 0.0204
## 8 BL MI21 -3.31 0.00161 0.0291
# Only calculate effect sizes if we have treatment effects (GDD phenotypes already excluded)
if (length(treatment_effects) > 0) {
# Calculate standardized effect sizes
effect_sizes <- map_dfr(treatment_effects, function(x) {
if (!is.null(x$contrast_summary)) {
x$contrast_summary %>%
as_tibble() %>%
mutate(
phenotype = x$phenotype,
effect_size = abs(estimate) / SE, # Rough standardized effect size
direction = ifelse(estimate > 0, "INV4M > CTRL", "INV4M < CTRL")
) %>%
select(phenotype, donor, estimate, SE, effect_size, direction, p.value)
}
})
if (nrow(effect_sizes) > 0) {
cat("\n=== EFFECT SIZES AND DIRECTIONS FOR INV4M CONTRASTS ===\n")
print(effect_sizes)
} else {
cat("No effect size data available\n")
}
} else {
cat("Skipping effect size analysis - no treatment effects available\n")
}
##
## === EFFECT SIZES AND DIRECTIONS FOR INV4M CONTRASTS ===
## # A tibble: 18 × 7
## phenotype donor estimate SE effect_size direction p.value
## <chr> <fct> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 DTA MI21 -0.499 0.287 1.74 INV4M < CTRL 0.0870
## 2 DTA TMEX 1.61 0.291 5.52 INV4M > CTRL 0.000000784
## 3 DTS MI21 -0.695 0.289 2.41 INV4M < CTRL 0.0193
## 4 DTS TMEX 1.74 0.294 5.93 INV4M > CTRL 0.000000172
## 5 LAE MI21 0.132 0.0999 1.32 INV4M > CTRL 0.193
## 6 LAE TMEX -0.350 0.102 3.42 INV4M < CTRL 0.00113
## 7 PH MI21 -4.00 2.03 1.97 INV4M < CTRL 0.0533
## 8 PH TMEX 9.83 2.05 4.80 INV4M > CTRL 0.0000113
## 9 EN MI21 0.0592 0.0801 0.739 INV4M > CTRL 0.463
## 10 EN TMEX 0.367 0.0820 4.48 INV4M > CTRL 0.0000353
## 11 SL MI21 0.112 0.0811 1.38 INV4M > CTRL 0.168
## 12 SL TMEX 0.00540 0.0854 0.0632 INV4M > CTRL 0.950
## 13 BL MI21 -3.11 0.942 3.31 INV4M < CTRL 0.00161
## 14 BL TMEX -0.873 0.965 0.905 INV4M < CTRL 0.369
## 15 BW MI21 -0.583 0.157 3.70 INV4M < CTRL 0.000472
## 16 BW TMEX -0.410 0.159 2.58 INV4M < CTRL 0.0125
## 17 EBA MI21 -45.1 11.5 3.94 INV4M < CTRL 0.000220
## 18 EBA TMEX -24.4 11.6 2.10 INV4M < CTRL 0.0400
cat("=== ANALYSIS SUMMARY ===\n\n")
## === ANALYSIS SUMMARY ===
cat("1. MISSING DATA:\n")
## 1. MISSING DATA:
print(missing_summary)
## # A tibble: 9 × 5
## phenotype missing_count total_obs missing_pct available_n
## <chr> <int> <int> <dbl> <int>
## 1 DTA 0 442 0 442
## 2 DTS 0 442 0 442
## 3 LAE 1 442 0.2 441
## 4 PH 0 442 0 442
## 5 EN 0 442 0 442
## 6 SL 47 442 10.6 395
## 7 BL 57 442 12.9 385
## 8 BW 62 442 14 380
## 9 EBA 72 442 16.3 370
cat("\n2. MODEL SELECTION FREQUENCY:\n")
##
## 2. MODEL SELECTION FREQUENCY:
if (exists("model_frequency") && nrow(model_frequency) > 0) {
print(model_frequency)
} else {
cat("No successful model fits available\n")
}
## # A tibble: 9 × 4
## # Groups: phenotype [9]
## phenotype best_model n frequency
## <chr> <chr> <int> <dbl>
## 1 BL model_6 1 11.1
## 2 BW model_6 1 11.1
## 3 DTA model_6 1 11.1
## 4 DTS model_6 1 11.1
## 5 EBA model_6 1 11.1
## 6 EN model_6 1 11.1
## 7 LAE model_6 1 11.1
## 8 PH model_6 1 11.1
## 9 SL model_5 1 11.1
cat("\n3. SIGNIFICANT TREATMENT EFFECTS:\n")
##
## 3. SIGNIFICANT TREATMENT EFFECTS:
if (exists("significant_effects") && nrow(significant_effects) > 0) {
print(significant_effects)
} else {
cat("No significant effects detected at p < 0.05\n")
}
## # A tibble: 11 × 6
## phenotype model_used donor estimate SE p.value
## <chr> <chr> <fct> <dbl> <dbl> <dbl>
## 1 DTA model_6 TMEX 1.61 0.291 0.000000784
## 2 DTS model_6 MI21 -0.695 0.289 0.0193
## 3 DTS model_6 TMEX 1.74 0.294 0.000000172
## 4 LAE model_6 TMEX -0.350 0.102 0.00113
## 5 PH model_6 TMEX 9.83 2.05 0.0000113
## 6 EN model_6 TMEX 0.367 0.0820 0.0000353
## 7 BL model_6 MI21 -3.11 0.942 0.00161
## 8 BW model_6 MI21 -0.583 0.157 0.000472
## 9 BW model_6 TMEX -0.410 0.159 0.0125
## 10 EBA model_6 MI21 -45.1 11.5 0.000220
## 11 EBA model_6 TMEX -24.4 11.6 0.0400
cat("\n4. SPATIAL AUTOCORRELATION:\n")
##
## 4. SPATIAL AUTOCORRELATION:
if (length(variogram_results) > 0) {
cat("Variograms calculated for", length(variogram_results), "phenotypes\n")
cat("All phenotypes showed evidence of spatial structure in their variograms\n")
cat("Scaling revealed relative magnitudes of spatial variance across traits\n")
} else {
cat("No variograms could be calculated\n")
}
## Variograms calculated for 9 phenotypes
## All phenotypes showed evidence of spatial structure in their variograms
## Scaling revealed relative magnitudes of spatial variance across traits
This analysis demonstrates the importance of:
Proper missing data handling: Using complete-case analysis within each phenotype ensures valid model comparisons while maximizing sample sizes.
Systematic model comparison: The hierarchical approach reveals that different phenotypes may require different spatial modeling strategies.
Spatial structure accounting: The variogram analysis and model comparison show that ignoring spatial autocorrelation would lead to biased inference.
Treatment-by-background interactions: The donor × inv4m interaction patterns vary across phenotypes, highlighting the importance of genetic background in phenotypic expression.
Model selection: Use the phenotype-specific best models identified here for final inference.
Multiple testing: Consider adjusting p-values for multiple phenotype comparisons.
Biological interpretation: The spatial patterns and treatment effects should be interpreted in the context of the biological processes underlying each phenotype.
Future studies: The variogram parameters can inform optimal sampling designs for future experiments.