This notebook fits a SpATS PSANOVA surface per
phenotype and returns spatially-corrected plot-level values for
each phenotype. The approach uses statgenHTP::fitModels()
(which wraps SpATS when provided a TimePoints object) and
getCorrected()
to extract corrected plot-level
observations. The workflow:
TimePoints
object for
statgenHTP
.fitModels()
→
getCorrected()
.sp_corrected
.Warning: fitting many SpATS models can be computationally intensive. Run interactively on a machine with sufficient RAM/CPU.
library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggtext)
library(ggfx)
library(nlme) # Mixed-effects models (if needed)
library(gstat) # Variograms
library(rstatix)
library(ggpubr)
library(VIM)
library(knitr)
library(SpATS) # SpATS package (used internally by statgenHTP)
library(statgenHTP) # Provides fitModels() and getCorrected()
# Load the dataset
file_path <- "/Users/fvrodriguez/Desktop/inv4mHighland/data/CLY25_Inv4m.csv"
if (!file.exists(file_path)) {
stop("Error: CLY25_Inv4m.csv not found in the working directory.")
} else {
field_data_raw <- read.csv(file_path, na.strings = c("","#N/A","NA"))
# Clean and prepare plant-level data
field_data <- field_data_raw %>%
# Calculate Estimated Blade Area (EBA) = 0.75 * BL * BW FIRST
mutate(EBA = 0.75 * BL * BW) %>%
rename(
plant_id = plant,
plotId = plot_id,
block = rep,
x = X_pos,
y = Y_pos,
inv4m = inv4m_gt
) %>%
# Convert relevant columns to factors
mutate(across(c(plotId, block, donor, inv4m), as.factor)) %>%
# Add small amount of noise to x coordinates to avoid identical positions
mutate(x = x + runif(n(), min = 0.0, max = 0.01))
# Define phenotypes available in CLY2025
base_phenotypes <- c("DTA", "DTS", "LAE", "PH", "EN", "SL", "BL", "BW", "EBA")
# Verify all phenotypes exist in the dataset
available_phenotypes <- intersect(base_phenotypes, names(field_data))
all_phenotypes <- available_phenotypes
cat("Plant-level data loaded and cleaned successfully!\n")
cat("Phenotypes available for analysis:", paste(all_phenotypes, collapse = ", "), "\n")
cat("Plant-level data dimensions:", dim(field_data), "\n")
}
## Plant-level data loaded and cleaned successfully!
## Phenotypes available for analysis: DTA, DTS, LAE, PH, EN, SL, BL, BW, EBA
## Plant-level data dimensions: 442 24
head(field_data)
## plant_id row_id plotId plot_row plot_col block y x donor inv4m
## 1 1 4573 1 1 1 3 1.7 1.003038 TMEX INV4M
## 2 2 4573 1 1 1 3 2.3 1.006843 TMEX INV4M
## 3 3 4573 1 1 1 3 3.6 1.004255 TMEX INV4M
## 4 4 4573 1 1 1 3 4.0 1.009143 TMEX INV4M
## 5 5 4573 1 1 1 3 4.5 1.005382 TMEX INV4M
## 6 6 4573 1 1 1 3 5.0 1.008563 TMEX INV4M
## group DOA DOS DTA DTS DTA_GDD DTS_GDD LAE PH EN SL BL BW EBA
## 1 TMEX-INV4M 20 22 77 79 1915 1975 5 192 2 13.5 53.0 8.0 318.0000
## 2 TMEX-INV4M 20 22 77 79 1915 1975 5 209 3 14.0 50.5 7.5 284.0625
## 3 TMEX-INV4M 23 24 80 81 2011 2047 5 208 3 NA NA NA NA
## 4 TMEX-INV4M 23 24 80 81 2011 2047 5 212 3 14.0 53.5 7.5 300.9375
## 5 TMEX-INV4M 21 23 78 80 1943 2011 5 218 3 13.5 56.0 8.0 336.0000
## 6 TMEX-INV4M 22 24 79 81 1975 2047 5 225 3 14.0 64.0 8.5 408.0000
# Aggregate from plant-level to plot-level means
plot_data <- field_data %>%
group_by(plotId, block, plot_row, plot_col, donor, inv4m) %>%
summarise(
across(all_of(all_phenotypes), ~ mean(.x, na.rm = TRUE)),
n_plants = n(),
.groups = 'drop'
) %>%
# Convert NaN to NA (from mean of all NA values)
mutate(across(all_of(all_phenotypes), ~ ifelse(is.nan(.x), NA, .x))) %>%
# Rename columns to match PSU2022 structure
rename(# unique plot identifier
Rep = block, # replicate/block
Plot_Row = plot_row, # row position
Plot_Column = plot_col, # column position
) %>%
# Ensure proper data types
mutate(
Plot_Row = as.numeric(Plot_Row),
Plot_Column = as.numeric(Plot_Column),
Rep = as.factor(Rep),
genotype = as.factor(paste(donor,inv4m, sep =".")), # Combine donor and inv4m for genotype
)
cat("Plot-level summary dimensions:", dim(plot_data), "\n")
## Plot-level summary dimensions: 64 17
cat("Average plants per plot:", round(mean(plot_data$n_plants, na.rm = TRUE), 1), "\n")
## Average plants per plot: 6.9
# Check factor levels
cat("Genotype levels:", levels(plot_data$genotype), "\n")
## Genotype levels: MI21.CTRL MI21.INV4M TMEX.CTRL TMEX.INV4M
cat("Rep levels:", levels(plot_data$Rep), "\n")
## Rep levels: 1 2 3 4
head(plot_data)
## # A tibble: 6 × 17
## plotId Rep Plot_Row Plot_Column donor inv4m DTA DTS LAE PH EN
## <fct> <fct> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 3 1 1 TMEX INV4M 78.3 79.7 5.14 211. 2.86
## 2 2 3 2 1 MI21 INV4M 76.3 77.6 5.71 176. 2.57
## 3 3 3 3 1 MI21 CTRL 75 75.5 5.75 193 2.75
## 4 4 3 4 1 TMEX CTRL 77.8 78.6 5.5 199. 2.5
## 5 5 1 5 1 MI21 INV4M 75.9 76.8 5.62 179. 2.62
## 6 6 1 6 1 TMEX INV4M 78 78.7 5 205 3
## # ℹ 6 more variables: SL <dbl>, BL <dbl>, BW <dbl>, EBA <dbl>, n_plants <int>,
## # genotype <fct>
missing_summary <- plot_data %>%
select(all_of(all_phenotypes)) %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "phenotype", values_to = "missing_count") %>%
mutate(
total_obs = nrow(plot_data),
missing_pct = round(missing_count / total_obs * 100, 1),
available_n = total_obs - missing_count
) %>%
arrange(missing_pct)
print(missing_summary)
## # A tibble: 9 × 5
## phenotype missing_count total_obs missing_pct available_n
## <chr> <int> <int> <dbl> <int>
## 1 DTA 0 64 0 64
## 2 DTS 0 64 0 64
## 3 LAE 0 64 0 64
## 4 PH 0 64 0 64
## 5 EN 0 64 0 64
## 6 SL 0 64 0 64
## 7 BL 0 64 0 64
## 8 BW 0 64 0 64
## 9 EBA 0 64 0 64
usable_phenotypes <- missing_summary %>%
filter(missing_pct < 50) %>%
pull(phenotype)
cat("Usable phenotypes (<50% missing):", paste(usable_phenotypes, collapse = ", "), "\n")
## Usable phenotypes (<50% missing): DTA, DTS, LAE, PH, EN, SL, BL, BW, EBA
single_time <- plot_data %>%
mutate(
plotId= as.character(plotId), # Ensure plotId is character
Rep = as.character(Rep),
Plot_Row = as.numeric(Plot_Row),
Plot_Column = as.numeric(Plot_Column)
) %>%
mutate(time = as.POSIXct("2025-01-15 10:00:00")) # Use CLY2025 date
tp_data <- createTimePoints(
dat = single_time,
experimentName = "CLY2025",
genotype="genotype",
timePoint = "time",
plotId = "plotId", # unique ID per plot
repId = "Rep",
rowNum = "Plot_Row",
colNum = "Plot_Column",
addCheck = FALSE
)
summary(tp_data)
## tp_data contains data for experiment CLY2025.
##
## It contains 1 time points.
## First time point: 2025-01-15 10:00:00
## Last time point: 2025-01-15 10:00:00
##
## No check genotypes are defined.
# Initialize lists
corrected_list <- list()
fit_info <- list()
all_models <- list()
for (phen in usable_phenotypes) {
message("Fitting SpATS model for: ", phen)
# Check if we have enough non-missing observations
no_nas <- single_time %>%
filter(!is.na(.data[[phen]]))
if (nrow(no_nas) < 10) {
message("Insufficient data for ", phen, " (n=", nrow(no_nas), ")")
next
}
# Fit model
fit_try <- try(
fitModels(
TP = tp_data,
trait = phen,
extraFixedFactors = c("repId"),
timePoints = 1,
what = "fixed"
),
silent = TRUE
)
if (inherits(fit_try, "try-error")) {
message("Fit failed for ", phen, ": ", fit_try)
next
}
all_models[[phen]] <- fit_try
# Extract corrected values
corr <- try(getCorrected(fit_try), silent = TRUE)
if (inherits(corr, "try-error") || is.null(corr)) {
message("getCorrected() failed for ", phen)
next
}
corr <- corr %>%
select(-phen)
corrected_list[[phen]] <- corr
fit_info[[phen]] <- list(n_obs = nrow(no_nas), status = "success")
}
# Combine all corrected data into a single data frame
if (length(corrected_list) > 0) {
sp_corrected <- lapply(
corrected_list,
function(corr_df) {
corr_df %>%
select(timeNumber, timePoint, genotype, repId, rowId, colId, plotId)
}
) %>%
bind_rows() %>%
arrange(plotId) %>%
distinct()
# Join each corrected phenotype
for (phen in names(corrected_list)) {
phen_corr <- paste0(phen, "_corr")
corr_df <- corrected_list[[phen]] %>%
select(plotId, all_of(phen_corr))
sp_corrected <- left_join(
sp_corrected,
corr_df,
by = "plotId"
)
}
# Clean column names (remove _corr suffix)
colnames(sp_corrected) <- gsub("_corr", "", colnames(sp_corrected))
sp_corrected <- sp_corrected %>%
separate(genotype, c("donor","genotype"))
# Print dimensions and preview
cat("Final sp_corrected dimensions:", dim(sp_corrected), "\n")
print(head(sp_corrected))
} else {
message("No models were successfully fitted.")
}
trait_names <- c("DTS","PH") # Example traits to visualize
for (trait in trait_names) {
cat("\n### Spatial plots for", trait, "\n")
# Model diagnostic plots
plot(all_models[[trait]], timePoints = 1)
# Spatial trend plots
plot(all_models[[trait]],
timePoints = 1,
plotType = "spatial",
spaTrend = "raw")
}
##
## ### Spatial plots for DTS
##
## ### Spatial plots for PH
if (exists("sp_corrected") && nrow(sp_corrected) > 0) {
# Get corrected phenotype names (exclude metadata columns)
corrected_phenotypes <- intersect(usable_phenotypes, colnames(sp_corrected))
if (length(corrected_phenotypes) > 0) {
# Define comparison groups
comparison_genotypes <- unique(sp_corrected$genotype)
cat("Available genotypes:", paste(comparison_genotypes, collapse = ", "), "\n")
# Get unique donors
donors <- unique(sp_corrected$donor)
cat("Available donors:", paste(donors, collapse = ", "), "\n")
# T-test comparing genotypes with FDR correction (per donor)
htest <- sp_corrected %>%
pivot_longer(
cols = all_of(corrected_phenotypes),
names_to = "trait",
values_to = "value"
) %>%
filter(!is.na(value)) %>%
group_by(donor, trait) %>%
# Only perform t-test if we have exactly 2 genotype levels
filter(n_distinct(genotype) == 2) %>%
t_test(value ~ genotype) %>%
adjust_pvalue(method = "fdr") %>%
add_y_position(scales = "free") %>%
add_significance()
print(htest)
# Plot theme
pheno_theme2 <- ggpubr::theme_classic2(base_size = 20) +
ggplot2::theme(
plot.title = ggtext::element_markdown(hjust = 0.5, face = "bold"),
axis.title.y = ggtext::element_markdown(),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 25, face = "bold", color = "black"),
strip.background = element_blank(),
strip.text = ggtext::element_markdown(size = 20),
legend.position = "none"
)
# Main loop: iterate through each donor
for (current_donor in donors) {
cat("\n=== Processing donor:", current_donor, "===\n")
# Donor-specific filtering: get significant traits for this donor only
alpha <- 0.05
sig_traits_donor <- htest %>%
filter(donor == current_donor, p.adj < alpha) %>%
arrange(p.adj) %>%
pull(trait)
cat("Significant traits for", current_donor, "(FDR < 0.05):",
paste(sig_traits_donor, collapse = ", "), "\n")
if (length(sig_traits_donor) == 0) {
cat("No traits significant after FDR correction for donor", current_donor, "\n")
next
}
# Data filtering: prepare data for this donor and significant traits only
plot_data_donor <- sp_corrected %>%
filter(donor == current_donor) %>%
pivot_longer(
cols = all_of(corrected_phenotypes),
names_to = "trait",
values_to = "value"
) %>%
filter(trait %in% sig_traits_donor, !is.na(value)) %>%
mutate(trait = as.character(trait))
# Statistical tests: get p-values for this donor's significant traits
test_to_plot_donor <- htest %>%
filter(donor == current_donor, trait %in% sig_traits_donor)
# Plot customization: adaptive faceting based on number of traits
n_traits <- length(sig_traits_donor)
ncol_facets <- min(4, n_traits)
# Create donor-specific plot with custom title
diff_plot <- ggplot(plot_data_donor, aes(x = genotype, y = value, color = genotype)) +
ggplot2::geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0) +
ggbeeswarm::geom_quasirandom(size = 2) +
scale_color_manual(values = c("CTRL" = "gold", "INV4M" = "purple4")) +
labs(
title = paste("CLY25: ", current_donor, " Spatially Corrected (FDR-Adjusted)"),
y = "Corrected Value",
x = "Genotype"
) +
stat_pvalue_manual(
test_to_plot_donor,
tip.length = 0.01,
step.height = 0.08,
size = 10,
bracket.size = 0.8
) +
scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
facet_wrap(~ factor(trait, levels = sig_traits_donor),
scales = "free_y",
ncol = ncol_facets) +
pheno_theme2
print(diff_plot)
# Print summary statistics for this donor
cat("\n--- Summary for", current_donor, "---\n")
summary_stats <- plot_data_donor %>%
group_by(trait, genotype) %>%
summarise(
n = n(),
mean = round(mean(value, na.rm = TRUE), 3),
sd = round(sd(value, na.rm = TRUE), 3),
.groups = "drop"
) %>%
pivot_wider(
names_from = genotype,
values_from = c(n, mean, sd),
names_sep = "_"
)
print(summary_stats)
}
# Overall summary across all donors
cat("\n=== Overall Summary Across All Donors ===\n")
overall_sig <- htest %>%
filter(p.adj < alpha) %>%
group_by(trait) %>%
summarise(
n_donors_significant = n(),
min_pvalue = min(p.adj),
max_pvalue = max(p.adj),
.groups = "drop"
) %>%
arrange(desc(n_donors_significant), min_pvalue)
print(overall_sig)
} else {
message("No corrected phenotypes available for analysis")
}
} else {
message("No spatially corrected data available")
}
## Available genotypes: INV4M, CTRL
## Available donors: TMEX, MI21
## # A tibble: 18 × 14
## donor trait .y. group1 group2 n1 n2 statistic df p p.adj
## <chr> <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 MI21 BL value CTRL INV4M 16 16 5.95 29.9 1.63e-6 7.33e-6
## 2 MI21 BW value CTRL INV4M 16 16 5.64 30.0 3.86e-6 1.39e-5
## 3 MI21 DTA value CTRL INV4M 16 16 2.19 29.7 3.62e-2 4.98e-2
## 4 MI21 DTS value CTRL INV4M 16 16 3.16 30.0 3.57e-3 6.43e-3
## 5 MI21 EBA value CTRL INV4M 16 16 6.97 29.6 1.04e-7 6.24e-7
## 6 MI21 EN value CTRL INV4M 16 16 -0.903 29.8 3.74e-1 3.96e-1
## 7 MI21 LAE value CTRL INV4M 16 16 -2.16 29.7 3.87e-2 4.98e-2
## 8 MI21 PH value CTRL INV4M 16 16 3.62 29.7 1.09e-3 2.18e-3
## 9 MI21 SL value CTRL INV4M 16 16 -0.988 27.8 3.32e-1 3.74e-1
## 10 TMEX BL value CTRL INV4M 16 16 1.17 29.9 2.52e-1 3.02e-1
## 11 TMEX BW value CTRL INV4M 16 16 2.88 26.7 7.79e-3 1.27e-2
## 12 TMEX DTA value CTRL INV4M 16 16 -8.05 29.3 6.61e-9 1.19e-7
## 13 TMEX DTS value CTRL INV4M 16 16 -7.86 26.2 2.3 e-8 2.07e-7
## 14 TMEX EBA value CTRL INV4M 16 16 2.62 29.5 1.37e-2 2.06e-2
## 15 TMEX EN value CTRL INV4M 16 16 -4.25 27.9 2.15e-4 4.84e-4
## 16 TMEX LAE value CTRL INV4M 16 16 4.54 29.1 8.97e-5 2.31e-4
## 17 TMEX PH value CTRL INV4M 16 16 -4.70 25.5 7.66e-5 2.30e-4
## 18 TMEX SL value CTRL INV4M 16 16 0.214 30.0 8.32e-1 8.32e-1
## # ℹ 3 more variables: y.position <dbl>, groups <named list>, p.adj.signif <chr>
##
## === Processing donor: TMEX ===
## Significant traits for TMEX (FDR < 0.05): DTA, DTS, PH, LAE, EN, BW, EBA
##
## --- Summary for TMEX ---
## # A tibble: 7 × 7
## trait n_CTRL n_INV4M mean_CTRL mean_INV4M sd_CTRL sd_INV4M
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 BW 16 16 8.62 8.19 0.336 0.485
## 2 DTA 16 16 77.7 79.3 0.545 0.639
## 3 DTS 16 16 78.8 80.6 0.503 0.75
## 4 EBA 16 16 393. 368. 24.5 27.8
## 5 EN 16 16 2.20 2.53 0.183 0.244
## 6 LAE 16 16 6.03 5.73 0.203 0.171
## 7 PH 16 16 194. 204. 4.74 7.40
##
## === Processing donor: MI21 ===
## Significant traits for MI21 (FDR < 0.05): EBA, BL, BW, PH, DTS, DTA, LAE
##
## --- Summary for MI21 ---
## # A tibble: 7 × 7
## trait n_CTRL n_INV4M mean_CTRL mean_INV4M sd_CTRL sd_INV4M
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 BL 16 16 58.3 55.3 1.49 1.39
## 2 BW 16 16 8.78 8.14 0.32 0.321
## 3 DTA 16 16 77.7 77.3 0.615 0.552
## 4 DTS 16 16 78.4 77.6 0.638 0.66
## 5 EBA 16 16 386. 338. 20.6 18.4
## 6 LAE 16 16 5.89 6.02 0.157 0.172
## 7 PH 16 16 187. 183. 3.50 3.86
##
## === Overall Summary Across All Donors ===
## # A tibble: 8 × 4
## trait n_donors_significant min_pvalue max_pvalue
## <chr> <int> <dbl> <dbl>
## 1 DTA 2 0.000000119 0.0498
## 2 DTS 2 0.000000207 0.00643
## 3 EBA 2 0.000000624 0.0206
## 4 BW 2 0.0000139 0.0127
## 5 PH 2 0.000230 0.00218
## 6 LAE 2 0.000231 0.0498
## 7 BL 1 0.00000734 0.00000734
## 8 EN 1 0.000484 0.000484
# Print summary of analysis
cat("\n=== Analysis Summary ===\n")
##
## === Analysis Summary ===
cat("Total phenotypes evaluated:", length(all_phenotypes), "\n")
## Total phenotypes evaluated: 9
cat("Usable phenotypes (<50% missing):", length(usable_phenotypes), "\n")
## Usable phenotypes (<50% missing): 9
cat("Successfully fitted models:", length(all_models), "\n")
## Successfully fitted models: 9
if (exists("sp_corrected") && nrow(sp_corrected) > 0) {
cat("Plots with corrected data:", nrow(sp_corrected), "\n")
# Export corrected data
output_file <- "CLY2025_spatially_corrected_phenotypes.csv"
write.csv(sp_corrected, output_file, row.names = FALSE)
cat("Spatially corrected data exported to:", output_file, "\n")
} else {
cat("No corrected data to export\n")
}
## Plots with corrected data: 64
## Spatially corrected data exported to: CLY2025_spatially_corrected_phenotypes.csv
# Print model fit information
if (length(fit_info) > 0) {
cat("\n=== Model Fit Details ===\n")
for (trait in names(fit_info)) {
info <- fit_info[[trait]]
cat(sprintf("%s: n=%d, status=%s\n", trait, info$n_obs, info$status))
}
}
##
## === Model Fit Details ===
## DTA: n=64, status=success
## DTS: n=64, status=success
## LAE: n=64, status=success
## PH: n=64, status=success
## EN: n=64, status=success
## SL: n=64, status=success
## BL: n=64, status=success
## BW: n=64, status=success
## EBA: n=64, status=success