This notebook fits a SpATS PSANOVA surface per
phenotype and returns spatially-corrected plot-level values for
each phenotype. The approach uses statgenHTP::fitModels()
(which wraps SpATS when provided a TimePoints object) and
getCorrected()
to extract corrected plot-level
observations. The workflow:
plot_data
(plot-level means).TimePoints
object for
statgenHTP
.fitModels()
→
getCorrected()
.sp_corrected
.Warning: fitting many SpATS models can be computationally intensive. Run interactively on a machine with sufficient RAM/CPU.
library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggtext)
library(ggfx)
library(nlme) # Mixed-effects models (if needed)
library(gstat) # Variograms
library(rstatix)
library(ggpubr)
library(VIM)
library(knitr)
library(SpATS) # SpATS package (used internally by statgenHTP)
library(statgenHTP) # Provides fitModels() and getCorrected()
DATA_DIR <- "/Users/fvrodriguez/Desktop/inv4mHighland/data"
plant_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field.csv")
ear_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field_ear_pheno.csv")
field_data_raw <- read.csv(plant_csv, na.strings = c("", "#N/A", "NA"))
ear_data_raw <- read.csv(ear_csv, na.strings = c("", "n/a", "NA"), skip = 1)
field_data <- field_data_raw %>%
filter(P22. >= 3004, P22. <= 4192) %>%
rename(
rowid = P22.,
Genotype = Who.What,
PH = Height_Anthesis,
STW40 = X40_DAP_dw,
STW50 = X50_DAP_dw,
STW60 = X60_DAP_dw,
STWHV = harvest_dw
) %>%
filter(Genotype %in% c("CTRL", "INV4M")) %>%
mutate(
Treatment = factor(Treatment, levels = c("HighP", "LowP")),
Genotype = factor(Genotype, levels = c("CTRL", "INV4M")),
Rep = as.factor(Rep)
) %>%
mutate(
Treatment = fct_recode(Treatment, "+P" = "HighP", "-P" = "LowP"),
Genotype = fct_recode(Genotype, "Inv4m" = "INV4M")
)
ear_data <- ear_data_raw %>%
select(-description, -RK, -CC, -NIR, ear_rep = rep) %>%
rename(rowid = row) %>%
arrange(rowid) %>%
group_by(rowid) %>%
select(-ear_rep) %>%
summarise_all(mean, na.rm = TRUE) %>%
droplevels()
field_data_complete <- field_data %>%
left_join(ear_data, by = "rowid") %>%
mutate(HI = TKW / STWHV) %>%
mutate(
Plot_Column = case_when(
Plot == "PlotVIII" ~ Plot_Column + 10,
Plot == "PlotVI" ~ Plot_Column,
TRUE ~ Plot_Column
)
) %>%
filter(!is.na(PH)) %>%
droplevels()
# Define phenotypes
base_phenotypes <- c("PH", "DTA", "DTS", "STW40", "STW50", "STW60", "STWHV")
ear_phenotypes <- c("EW", "EL", "ED", "KRN", "CD", "TKW", "KW50", "TKN", "HI")
all_phenotypes <- unique(c(base_phenotypes, ear_phenotypes))
plot_data <- field_data_complete %>%
group_by(rowid, Rep, Plot_Row, Plot_Column, Genotype, Treatment) %>%
summarise(
across(all_of(all_phenotypes), ~ mean(.x, na.rm = TRUE)),
n_plants = n(),
.groups = 'drop'
) %>%
mutate(across(all_of(all_phenotypes), ~ ifelse(is.nan(.x), NA, .x)))
plot_data$Plot_Row <- as.numeric(plot_data$Plot_Row)
plot_data$Plot_Column <- as.numeric(plot_data$Plot_Column)
plot_data$Rep <- as.factor(plot_data$Rep)
cat("Plot-level summary dimensions:", dim(plot_data), "\n")
## Plot-level summary dimensions: 64 23
cat("Average plants per plot:", round(mean(plot_data$n_plants, na.rm = TRUE), 1), "\n")
## Average plants per plot: 1
missing_summary <- plot_data %>%
select(all_of(all_phenotypes)) %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "phenotype", values_to = "missing_count") %>%
mutate(
total_obs = nrow(plot_data),
missing_pct = round(missing_count / total_obs * 100, 1),
available_n = total_obs - missing_count
) %>%
arrange(missing_pct)
print(missing_summary)
## # A tibble: 16 × 5
## phenotype missing_count total_obs missing_pct available_n
## <chr> <int> <int> <dbl> <int>
## 1 PH 0 64 0 64
## 2 DTA 0 64 0 64
## 3 DTS 0 64 0 64
## 4 STW40 1 64 1.6 63
## 5 STWHV 2 64 3.1 62
## 6 STW50 7 64 10.9 57
## 7 STW60 10 64 15.6 54
## 8 EW 11 64 17.2 53
## 9 EL 11 64 17.2 53
## 10 ED 11 64 17.2 53
## 11 KRN 11 64 17.2 53
## 12 CD 11 64 17.2 53
## 13 TKW 11 64 17.2 53
## 14 KW50 11 64 17.2 53
## 15 TKN 11 64 17.2 53
## 16 HI 11 64 17.2 53
usable_phenotypes <- missing_summary %>%
filter(missing_pct < 50) %>%
pull(phenotype)
cat("Usable phenotypes (<50% missing):", paste(usable_phenotypes, collapse = ", "), "\n")
## Usable phenotypes (<50% missing): PH, DTA, DTS, STW40, STWHV, STW50, STW60, EW, EL, ED, KRN, CD, TKW, KW50, TKN, HI
single_time <- plot_data %>%
mutate(
Genotype = as.character(Genotype), # statgenHTP expects character genotype IDs
Treatment = as.character(Treatment),
Rep = as.character(Rep),
Plot_Row = as.numeric(Plot_Row),
Plot_Column = as.numeric(Plot_Column)
) %>%
mutate(time = as.POSIXct("2022-09-15 10:00:00"))
tp_data <- createTimePoints(
dat = single_time,
experimentName = "PSU2022",
genotype = "Genotype",
timePoint = "time",
plotId = "rowid", # unique ID per plot
repId = "Rep",
rowNum = "Plot_Row",
colNum = "Plot_Column",
addCheck = FALSE
)
summary(tp_data)
## tp_data contains data for experiment PSU2022.
##
## It contains 1 time points.
## First time point: 2022-09-15 10:00:00
## Last time point: 2022-09-15 10:00:00
##
## No check genotypes are defined.
## 5. Fit SpATS models and extract corrected phenotypes
# Initialize lists
corrected_list <- list()
fit_info <- list()
all_models <- list()
for (phen in usable_phenotypes) {
message("Fitting SpATS model for: ", phen)
# Use single_time (has all phenotypes)
no_nas <- single_time %>%
filter(!is.na(.data[[phen]]))
# Fit model
fit_try <- try(
fitModels(
TP = tp_data,
trait = phen,
extraFixedFactors = c("repId", "Treatment"),
timePoints = 1,
what = "fixed"
),
silent = TRUE
)
if (inherits(fit_try, "try-error")) {
message("Fit failed for ", phen, ": ", fit_try)
next
}
all_models[[phen]] <- fit_try
# Extract corrected values
corr <- try(getCorrected(fit_try), silent = TRUE)
if (inherits(corr, "try-error") || is.null(corr)) {
message("getCorrected() failed for ", phen)
next
}
corr <- corr %>%
select(-phen)
corrected_list[[phen]] <- corr
fit_info[[phen]] <- list(n_obs = nrow(no_nas), status = "success")
}
# Combine all corrected data into a single data frame
sp_corrected <- lapply(
corrected_list,
function(corr_df) {
corr_df %>%
select(timeNumber,timePoint,genotype,repId,Treatment,rowId,colId,plotId)
}
) %>%
bind_rows() %>%
arrange(plotId) %>%
distinct()
sp_corrected
# Join each corrected phenotype
for (phen in names(corrected_list)) {
phen_corr <- paste0(phen, "_corr")
corr_df <- corrected_list[[phen]] %>%
select(plotId,all_of(phen_corr))
sp_corrected <- left_join(
sp_corrected,
corr_df
)
}
colnames(sp_corrected) <- gsub(
"_corr", "",
colnames(sp_corrected)
)
# Print dimensions and preview
cat("Final sp_corrected dimensions:", dim(sp_corrected), "\n")
print(head(sp_corrected))
plot(all_models$PH,
timePoints = 1,
plotType = "spatial",
spaTrend = "raw")
plot(all_models$TKW,
timePoints = 1,
plotType = "spatial",
spaTrend = "raw")
# T-test: Inv4m vs CTRL with FDR correction
htest <- sp_corrected %>%
pivot_longer(
cols = all_of(usable_phenotypes),
names_to = "trait",
values_to = "value"
) %>%
group_by(trait) %>%
t_test(value ~ genotype) %>%
adjust_pvalue(method = "fdr") %>%
add_y_position(scales = "free") %>%
add_significance()
# Filter to only FDR-significant traits (optional)
alpha <- 0.05
sig_traits <- htest %>%
filter(p.adj < alpha) %>%
arrange(p.adj) %>%
pull(trait)
sig_traits
## [1] "PH" "DTA" "DTS" "HI" "CD" "EW" "TKW"
pheno_theme2 <- ggpubr::theme_classic2(base_size = 20) +
ggplot2::theme(
plot.title = ggtext::element_markdown(hjust = 0.5, face ="bold"),
axis.title.y = ggtext::element_markdown(),
axis.title.x = element_blank(),
axis.text.x = element_text(size=25, face="bold", color= "black"),
strip.background = element_blank(),
strip.text = ggtext::element_markdown(size=20),
legend.position = "none")
if (length(sig_traits) == 0) {
warning("No traits significant after FDR correction.")
} else {
# Prepare data: pivot once, filter to significant traits
plot_data <- sp_corrected %>%
pivot_longer(
cols = all_of(usable_phenotypes),
names_to = "trait",
values_to = "value"
) %>%
filter(trait %in% sig_traits) %>%
mutate(trait= as.character(trait))
# Define y positions for brackets (just above each panel's max)
bracket_heights <- plot_data %>%
group_by(trait) %>%
summarise(y_pos = max(value, na.rm = TRUE) * 1.05, .groups = "drop")
# Join with p_value_result for labels
test_to_plot <- htest %>%
filter(trait %in% sig_traits)
# Make the plot
diff_plot <- ggplot(plot_data, aes(x = genotype, y = value, color = genotype)) +
ggplot2::geom_boxplot(width = 0.25, linewidth=1.5,alpha=0) %>% with_shadow(
colour = "black",
x_offset = 0,
y_offset = 0,
sigma = 1) +
ggbeeswarm::geom_quasirandom(size=2) %>% with_shadow(
colour = "black",
x_offset = 0,
y_offset = 0,
sigma = 1) +
scale_color_manual(values = c("CTRL" = "gold", "Inv4m" = "purple4")) +
labs(
title = "Inv4m vs CTRL: Spatially Corrected Phenotypes (FDR-Adjusted)",
y = "Corrected Value",
x = "Genotype"
) +
stat_pvalue_manual(
test_to_plot ,
tip.length = 0.01,
step.height = 0.08,
size = 10,
bracket.size = 0.8,
) +
scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
facet_wrap(~ factor(trait, levels = sig_traits), scales = "free_y", ncol = 3) +
pheno_theme2
}
print(diff_plot)