This notebook fits a SpATS PSANOVA surface per
phenotype and returns spatially-corrected plot-level values for
each phenotype, including GDD-based traits (GDDA and GDDS). The approach
uses statgenHTP::fitModels() (which wraps SpATS when
provided a TimePoints object) and getCorrected() to extract
corrected plot-level observations. The workflow:
plot_data (plot-level means) and merge GDD
values.TimePoints object for
statgenHTP.fitModels() → getCorrected().sp_corrected.Warning: fitting many SpATS models can be computationally intensive. Run interactively on a machine with sufficient RAM/CPU.
library(tidyverse)
library(dplyr)
library(lubridate)
library(ggplot2)
library(ggtext)
library(ggfx)
library(nlme)
library(gstat)
library(rstatix)
library(ggpubr)
library(VIM)
library(knitr)
library(SpATS)
library(statgenHTP)
library(ggbeeswarm)
environment_name <- "PSU2022"
planting_date <- mdy("5/26/2022")
gdd_lookup_file <- "gdd_lookup_table.csv"
gdd_at_phenology <- read_csv(gdd_lookup_file) %>%
filter(environment == environment_name)
cat("Loaded GDD lookup for", environment_name, "with",
nrow(gdd_at_phenology), "dates\n")
## Loaded GDD lookup for PSU2022 with 127 dates
cat("GDD range:", min(gdd_at_phenology$cumulative_gdd_from_planting), "to",
max(gdd_at_phenology$cumulative_gdd_from_planting), "°C\n")
## GDD range: 6.28 to 1412.355 °C
DATA_DIR <- "/Users/fvrodriguez/Desktop/inv4mHighland/data"
plant_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field.csv")
ear_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field_ear_pheno.csv")
field_data_raw <- read.csv(plant_csv, na.strings = c("", "#N/A", "NA"))
ear_data_raw <- read.csv(ear_csv, na.strings = c("", "n/a", "NA"), skip = 1)
field_data <- field_data_raw %>%
filter(P22. >= 3004, P22. <= 4192) %>%
rename(
rowid = P22.,
Genotype = Who.What,
PH = Height_Anthesis,
STW40 = X40_DAP_dw,
STW50 = X50_DAP_dw,
STW60 = X60_DAP_dw,
STWHV = harvest_dw
) %>%
mutate(
Treatment = factor(Treatment, levels = c("HighP", "LowP")),
Genotype = factor(Genotype),
Rep = as.factor(Rep)
) %>%
mutate(
Treatment = fct_recode(Treatment, "+P" = "HighP", "-P" = "LowP"),
Genotype = fct_recode(Genotype, "Inv4m" = "INV4M")
)
ear_data <- ear_data_raw %>%
select(-description, -RK, -CC, -NIR, ear_rep = rep) %>%
rename(rowid = row) %>%
arrange(rowid) %>%
group_by(rowid) %>%
select(-ear_rep) %>%
summarise_all(mean, na.rm = TRUE) %>%
droplevels()
field_data_complete <- field_data %>%
left_join(ear_data, by = "rowid") %>%
mutate(HI = TKW / STWHV) %>%
mutate(
Plot_Column = case_when(
Plot == "PlotVIII" ~ Plot_Column + 10,
Plot == "PlotVI" ~ Plot_Column,
TRUE ~ Plot_Column
)
) %>%
filter(!is.na(PH)) %>%
droplevels()
base_phenotypes <- c("PH", "DTA", "DTS", "STW40", "STW50", "STW60", "STWHV")
ear_phenotypes <- c("EW", "EL", "ED", "KRN", "CD", "TKW", "KW50", "TKN", "HI")
all_phenotypes <- unique(c(base_phenotypes, ear_phenotypes))
cat("Field data dimensions:", dim(field_data_complete), "\n")
## Field data dimensions: 127 39
cat("Phenotypes available:", paste(all_phenotypes, collapse = ", "), "\n")
## Phenotypes available: PH, DTA, DTS, STW40, STW50, STW60, STWHV, EW, EL, ED, KRN, CD, TKW, KW50, TKN, HI
plot_data <- field_data_complete %>%
group_by(rowid, Rep, Plot_Row, Plot_Column, Genotype, Treatment) %>%
summarise(
across(all_of(all_phenotypes), ~ mean(.x, na.rm = TRUE)),
n_plants = n(),
.groups = 'drop'
) %>%
mutate(
across(all_of(all_phenotypes), ~ ifelse(is.nan(.x), NA, .x)),
environment = environment_name,
anthesis_date = planting_date + days(round(DTA)),
silking_date = planting_date + days(round(DTS))
) %>%
left_join(
gdd_at_phenology %>%
rename(anthesis_date = date, GDDA = cumulative_gdd_from_planting),
by = c("environment", "anthesis_date")
) %>%
left_join(
gdd_at_phenology %>%
rename(silking_date = date, GDDS = cumulative_gdd_from_planting),
by = c("environment", "silking_date")
) %>%
select(-environment, -anthesis_date, -silking_date)
plot_data$Plot_Row <- as.numeric(plot_data$Plot_Row)
plot_data$Plot_Column <- as.numeric(plot_data$Plot_Column)
plot_data$Rep <- as.factor(plot_data$Rep)
cat("Plot-level summary dimensions:", dim(plot_data), "\n")
## Plot-level summary dimensions: 127 25
cat("Average plants per plot:", round(mean(plot_data$n_plants, na.rm = TRUE), 1), "\n")
## Average plants per plot: 1
cat("GDD columns added: GDDA (anthesis), GDDS (silking)\n")
## GDD columns added: GDDA (anthesis), GDDS (silking)
cat("Plots with GDDA:", sum(!is.na(plot_data$GDDA)), "\n")
## Plots with GDDA: 127
cat("Plots with GDDS:", sum(!is.na(plot_data$GDDS)), "\n")
## Plots with GDDS: 127
all_traits <- c(all_phenotypes, "GDDA", "GDDS")
missing_summary <- plot_data %>%
select(all_of(all_traits)) %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "phenotype", values_to = "missing_count") %>%
mutate(
total_obs = nrow(plot_data),
missing_pct = round(missing_count / total_obs * 100, 1),
available_n = total_obs - missing_count
) %>%
arrange(missing_pct)
print(missing_summary)
## # A tibble: 18 × 5
## phenotype missing_count total_obs missing_pct available_n
## <chr> <int> <int> <dbl> <int>
## 1 PH 0 127 0 127
## 2 DTA 0 127 0 127
## 3 DTS 0 127 0 127
## 4 GDDA 0 127 0 127
## 5 GDDS 0 127 0 127
## 6 STW40 2 127 1.6 125
## 7 STWHV 2 127 1.6 125
## 8 STW50 15 127 11.8 112
## 9 STW60 20 127 15.7 107
## 10 EW 22 127 17.3 105
## 11 EL 22 127 17.3 105
## 12 ED 22 127 17.3 105
## 13 KRN 22 127 17.3 105
## 14 CD 22 127 17.3 105
## 15 TKW 22 127 17.3 105
## 16 KW50 22 127 17.3 105
## 17 TKN 22 127 17.3 105
## 18 HI 22 127 17.3 105
usable_phenotypes <- missing_summary %>%
filter(missing_pct < 50) %>%
pull(phenotype)
cat("Usable phenotypes (<50% missing):", paste(usable_phenotypes, collapse = ", "), "\n")
## Usable phenotypes (<50% missing): PH, DTA, DTS, GDDA, GDDS, STW40, STWHV, STW50, STW60, EW, EL, ED, KRN, CD, TKW, KW50, TKN, HI
single_time <- plot_data %>%
mutate(
Genotype = as.character(Genotype),
Treatment = as.character(Treatment),
Rep = as.character(Rep),
Plot_Row = as.numeric(Plot_Row),
Plot_Column = as.numeric(Plot_Column),
time = as.POSIXct("2022-05-22 10:00:00")
)
tp_data <- createTimePoints(
dat = single_time,
experimentName = environment_name,
genotype = "Genotype",
timePoint = "time",
plotId = "rowid",
repId = "Rep",
rowNum = "Plot_Row",
colNum = "Plot_Column",
checkGenotypes = c("NUE", "B73"),
addCheck = TRUE
)
summary(tp_data)
## tp_data contains data for experiment PSU2022.
##
## It contains 1 time points.
## First time point: 2022-05-22 10:00:00
## Last time point: 2022-05-22 10:00:00
##
## The following genotypes are defined as check genotypes: NUE, B73.
corrected_list <- list()
fit_info <- list()
all_models <- list()
for (phen in usable_phenotypes) {
message("Fitting SpATS model for: ", phen)
no_nas <- single_time %>%
filter(!is.na(.data[[phen]]))
if (nrow(no_nas) < 10) {
message("Insufficient data for ", phen, " (n=", nrow(no_nas), ")")
next
}
fit_try <- try(
fitModels(
TP = tp_data,
trait = phen,
extraFixedFactors = c("repId", "Treatment"),
timePoints = 1,
what = "fixed"
),
silent = TRUE
)
if (inherits(fit_try, "try-error")) {
message("Fit failed for ", phen, ": ", fit_try)
next
}
all_models[[phen]] <- fit_try
corr <- try(getCorrected(fit_try), silent = TRUE)
if (inherits(corr, "try-error") || is.null(corr)) {
message("getCorrected() failed for ", phen)
next
}
corr <- corr %>%
select(-phen)
corrected_list[[phen]] <- corr
fit_info[[phen]] <- list(n_obs = nrow(no_nas), status = "success")
}
sp_corrected <- lapply(
corrected_list,
function(corr_df) {
corr_df %>%
select(timeNumber, timePoint, genotype, repId, Treatment,
rowId, colId, plotId)
}
) %>%
bind_rows() %>%
arrange(plotId) %>%
distinct()
for (phen in names(corrected_list)) {
phen_corr <- paste0(phen, "_corr")
corr_df <- corrected_list[[phen]] %>%
select(plotId, all_of(phen_corr))
sp_corrected <- left_join(sp_corrected, corr_df, by = "plotId")
}
colnames(sp_corrected) <- gsub("_corr", "", colnames(sp_corrected))
cat("Final sp_corrected dimensions:", dim(sp_corrected), "\n")
cat("Corrected traits:",
paste(intersect(usable_phenotypes, colnames(sp_corrected)),
collapse = ", "), "\n")
print(head(sp_corrected))
if ("PH" %in% names(all_models)) {
plot(all_models$PH, timePoints = 1)
plot(all_models$PH, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}
if ("HI" %in% names(all_models)) {
plot(all_models$HI, timePoints = 1)
plot(all_models$HI, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}
if ("GDDA" %in% names(all_models)) {
cat("\n### Spatial plots for GDDA\n")
plot(all_models$GDDA, timePoints = 1)
plot(all_models$GDDA, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}
##
## ### Spatial plots for GDDA
if ("GDDS" %in% names(all_models)) {
cat("\n### Spatial plots for GDDS\n")
plot(all_models$GDDS, timePoints = 1)
plot(all_models$GDDS, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}
##
## ### Spatial plots for GDDS
NIL_corrected <- sp_corrected %>%
filter(genotype %in% c("Inv4m", "CTRL")) %>%
droplevels()
htest <- NIL_corrected %>%
pivot_longer(
cols = all_of(usable_phenotypes),
names_to = "trait",
values_to = "value"
) %>%
filter(!is.na(value)) %>%
group_by(trait) %>%
t_test(value ~ genotype) %>%
adjust_pvalue(method = "fdr") %>%
add_y_position(scales = "free") %>%
add_significance() %>%
arrange(p.adj)
print(htest)
## # A tibble: 18 × 13
## trait .y. group1 group2 n1 n2 statistic df p p.adj
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 PH value CTRL Inv4m 32 32 -5.82 61.3 0.000000232 0.00000418
## 2 DTA value CTRL Inv4m 32 32 4.89 60.4 0.00000793 0.0000714
## 3 DTS value CTRL Inv4m 32 32 4.73 58.9 0.0000143 0.0000738
## 4 GDDA value CTRL Inv4m 32 32 4.69 60.1 0.0000164 0.0000738
## 5 GDDS value CTRL Inv4m 32 32 4.60 55.7 0.0000247 0.0000889
## 6 HI value CTRL Inv4m 26 27 -2.65 48.6 0.0107 0.0321
## 7 CD value CTRL Inv4m 26 27 2.41 35.6 0.0211 0.0475
## 8 STWHV value CTRL Inv4m 30 32 2.41 58.6 0.0191 0.0475
## 9 TKW value CTRL Inv4m 26 27 -2.12 43.6 0.0396 0.0792
## 10 KRN value CTRL Inv4m 26 27 2.00 43.7 0.0515 0.0854
## 11 STW40 value CTRL Inv4m 31 32 1.98 59.9 0.0522 0.0854
## 12 EW value CTRL Inv4m 26 27 -1.95 43.6 0.0582 0.0873
## 13 ED value CTRL Inv4m 26 27 1.74 34.0 0.0913 0.126
## 14 STW50 value CTRL Inv4m 28 29 1.57 51.0 0.123 0.158
## 15 EL value CTRL Inv4m 26 27 -1.31 48.6 0.195 0.234
## 16 TKN value CTRL Inv4m 26 27 -1.27 42.8 0.21 0.236
## 17 STW60 value CTRL Inv4m 26 28 -0.549 48.7 0.586 0.620
## 18 KW50 value CTRL Inv4m 26 27 -0.332 37.4 0.742 0.742
## # ℹ 3 more variables: y.position <dbl>, groups <named list>, p.adj.signif <chr>
alpha <- 0.05
sig_traits <- htest %>%
filter(p.adj < alpha) %>%
arrange(p.adj) %>%
pull(trait)
cat("Significant traits (FDR < 0.05):", paste(sig_traits, collapse = ", "), "\n")
## Significant traits (FDR < 0.05): PH, DTA, DTS, GDDA, GDDS, HI, CD, STWHV
pheno_theme2 <- theme_classic2(base_size = 20) +
theme(
plot.title = element_markdown(hjust = 0.5, face = "bold"),
axis.title.y = element_markdown(),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 25, face = "bold", color = "black"),
strip.background = element_blank(),
strip.text = element_markdown(size = 20),
legend.position = "none"
)
if (length(sig_traits) == 0) {
warning("No traits significant after FDR correction.")
} else {
plot_data_sig <- NIL_corrected %>%
pivot_longer(
cols = all_of(usable_phenotypes),
names_to = "trait",
values_to = "value"
) %>%
filter(trait %in% sig_traits, !is.na(value)) %>%
mutate(trait = as.character(trait))
test_to_plot <- htest %>%
filter(trait %in% sig_traits)
diff_plot <- ggplot(plot_data_sig, aes(x = genotype, y = value, color = genotype)) +
geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
geom_quasirandom(size = 2) %>%
with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
scale_color_manual(values = c("CTRL" = "gold", "Inv4m" = "purple4")) +
labs(
title = "Inv4m vs CTRL: Spatially Corrected Phenotypes (FDR-Adjusted)",
y = "Corrected Value",
x = "Genotype"
) +
stat_pvalue_manual(
test_to_plot,
tip.length = 0.01,
step.height = 0.08,
size = 10,
bracket.size = 0.8
) +
scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
facet_wrap(~ factor(trait, levels = sig_traits), scales = "free_y", ncol = 4) +
pheno_theme2
print(diff_plot)
}
cat("\n=== Analysis Summary ===\n")
##
## === Analysis Summary ===
cat("Total phenotypes evaluated:", length(all_traits), "\n")
## Total phenotypes evaluated: 18
cat("Usable phenotypes (<50% missing):", length(usable_phenotypes), "\n")
## Usable phenotypes (<50% missing): 18
cat("Successfully fitted models:", length(all_models), "\n")
## Successfully fitted models: 18
if (exists("sp_corrected") && nrow(sp_corrected) > 0) {
cat("Plots with corrected data:", nrow(sp_corrected), "\n")
output_file <- paste0(environment_name, "_spatially_corrected_phenotypes.csv")
write_csv(sp_corrected, output_file)
cat("Spatially corrected data exported to:", output_file, "\n")
} else {
cat("No corrected data to export\n")
}
## Plots with corrected data: 127
## Spatially corrected data exported to: PSU2022_spatially_corrected_phenotypes.csv
if (length(fit_info) > 0) {
cat("\n=== Model Fit Details ===\n")
for (trait in names(fit_info)) {
info <- fit_info[[trait]]
cat(sprintf("%s: n=%d, status=%s\n", trait, info$n_obs, info$status))
}
}
##
## === Model Fit Details ===
## PH: n=127, status=success
## DTA: n=127, status=success
## DTS: n=127, status=success
## GDDA: n=127, status=success
## GDDS: n=127, status=success
## STW40: n=125, status=success
## STWHV: n=125, status=success
## STW50: n=112, status=success
## STW60: n=107, status=success
## EW: n=105, status=success
## EL: n=105, status=success
## ED: n=105, status=success
## KRN: n=105, status=success
## CD: n=105, status=success
## TKW: n=105, status=success
## KW50: n=105, status=success
## TKN: n=105, status=success
## HI: n=105, status=success