Overview

This notebook fits a SpATS PSANOVA surface per phenotype and returns spatially-corrected plot-level values for each phenotype. The approach uses statgenHTP::fitModels() (which wraps SpATS when provided a TimePoints object) and getCorrected() to extract corrected plot-level observations. The workflow:

  1. Load and clean data (as in prior notebook).
  2. Create plot_data (plot-level means).
  3. Build a TimePoints object for statgenHTP.
  4. Loop over usable phenotypes and apply fitModels() → getCorrected().
  5. Merge corrected values into a wide table sp_corrected.
  6. Save corrected table for downstream analyses.

Warning: fitting many SpATS models can be computationally intensive. Run interactively on a machine with sufficient RAM/CPU.

1. Load libraries

library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggtext)
library(ggfx)
library(nlme)          # Mixed-effects models (if needed)
library(gstat)         # Variograms
library(rstatix)
library(ggpubr)
library(VIM)
library(knitr)
library(SpATS)         # SpATS package (used internally by statgenHTP)
library(statgenHTP)    # Provides fitModels() and getCorrected()

2. Load and prepare data (as provided)

DATA_DIR <- "/Users/fvrodriguez/Desktop/inv4mHighland/data"
plant_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field.csv")
ear_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field_ear_pheno.csv")

field_data_raw <- read.csv(plant_csv, na.strings = c("", "#N/A", "NA"))
ear_data_raw <- read.csv(ear_csv, na.strings = c("", "n/a", "NA"), skip = 1)

field_data <- field_data_raw %>%
  filter(P22. >= 3004, P22. <= 4192) %>%
  rename(
    rowid = P22.,
    Genotype = Who.What,
    PH = Height_Anthesis,
    STW40 = X40_DAP_dw,
    STW50 = X50_DAP_dw, 
    STW60 = X60_DAP_dw,
    STWHV = harvest_dw
  ) %>%
  filter(Genotype %in% c("CTRL", "INV4M")) %>%
  mutate(
    Treatment = factor(Treatment, levels = c("HighP", "LowP")),
    Genotype = factor(Genotype, levels = c("CTRL", "INV4M")),
    Rep = as.factor(Rep)
  ) %>%
  mutate(
    Treatment = fct_recode(Treatment, "+P" = "HighP", "-P" = "LowP"),
    Genotype = fct_recode(Genotype, "Inv4m" = "INV4M")
  )

ear_data <- ear_data_raw %>%
  select(-description, -RK, -CC, -NIR, ear_rep = rep) %>%
  rename(rowid = row) %>%
  arrange(rowid) %>%
  group_by(rowid) %>%
  select(-ear_rep) %>%
  summarise_all(mean, na.rm = TRUE) %>%
  droplevels()

field_data_complete <- field_data %>%
  left_join(ear_data, by = "rowid") %>%
  mutate(HI = TKW / STWHV) %>%
  mutate(
    Plot_Column = case_when(
      Plot == "PlotVIII" ~ Plot_Column + 10,
      Plot == "PlotVI" ~ Plot_Column,
      TRUE ~ Plot_Column
    )
  ) %>%
  filter(!is.na(PH)) %>%
  droplevels()

# Define phenotypes
base_phenotypes <- c("PH", "DTA", "DTS", "STW40", "STW50", "STW60", "STWHV")
ear_phenotypes <- c("EW", "EL", "ED", "KRN", "CD", "TKW", "KW50", "TKN", "HI")
all_phenotypes <- unique(c(base_phenotypes, ear_phenotypes))

plot_data <- field_data_complete %>%
  group_by(rowid, Rep, Plot_Row, Plot_Column, Genotype, Treatment) %>%
  summarise(
    across(all_of(all_phenotypes), ~ mean(.x, na.rm = TRUE)),
    n_plants = n(),
    .groups = 'drop'
  ) %>%
  mutate(across(all_of(all_phenotypes), ~ ifelse(is.nan(.x), NA, .x)))

plot_data$Plot_Row <- as.numeric(plot_data$Plot_Row)
plot_data$Plot_Column <- as.numeric(plot_data$Plot_Column)
plot_data$Rep <- as.factor(plot_data$Rep)

cat("Plot-level summary dimensions:", dim(plot_data), "\n")
## Plot-level summary dimensions: 64 23
cat("Average plants per plot:", round(mean(plot_data$n_plants, na.rm = TRUE), 1), "\n")
## Average plants per plot: 1

3. Identify usable phenotypes (<50% missing)

missing_summary <- plot_data %>%
  select(all_of(all_phenotypes)) %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "phenotype", values_to = "missing_count") %>%
  mutate(
    total_obs = nrow(plot_data),
    missing_pct = round(missing_count / total_obs * 100, 1),
    available_n = total_obs - missing_count
  ) %>%
  arrange(missing_pct)

print(missing_summary)
## # A tibble: 16 × 5
##    phenotype missing_count total_obs missing_pct available_n
##    <chr>             <int>     <int>       <dbl>       <int>
##  1 PH                    0        64         0            64
##  2 DTA                   0        64         0            64
##  3 DTS                   0        64         0            64
##  4 STW40                 1        64         1.6          63
##  5 STWHV                 2        64         3.1          62
##  6 STW50                 7        64        10.9          57
##  7 STW60                10        64        15.6          54
##  8 EW                   11        64        17.2          53
##  9 EL                   11        64        17.2          53
## 10 ED                   11        64        17.2          53
## 11 KRN                  11        64        17.2          53
## 12 CD                   11        64        17.2          53
## 13 TKW                  11        64        17.2          53
## 14 KW50                 11        64        17.2          53
## 15 TKN                  11        64        17.2          53
## 16 HI                   11        64        17.2          53
usable_phenotypes <- missing_summary %>%
  filter(missing_pct < 50) %>%
  pull(phenotype)

cat("Usable phenotypes (<50% missing):", paste(usable_phenotypes, collapse = ", "), "\n")
## Usable phenotypes (<50% missing): PH, DTA, DTS, STW40, STWHV, STW50, STW60, EW, EL, ED, KRN, CD, TKW, KW50, TKN, HI

4. Build TimePoints object for statgenHTP

single_time <- plot_data %>%
  mutate(
    Genotype = as.character(Genotype),   # statgenHTP expects character genotype IDs
    Treatment = as.character(Treatment),
    Rep = as.character(Rep),
    Plot_Row = as.numeric(Plot_Row),
    Plot_Column = as.numeric(Plot_Column)
  ) %>%
  mutate(time = as.POSIXct("2022-09-15 10:00:00"))

tp_data <- createTimePoints(
  dat = single_time,
  experimentName = "PSU2022",
  genotype = "Genotype",
  timePoint = "time",
  plotId = "rowid",  # unique ID per plot
  repId = "Rep",
  rowNum = "Plot_Row",
  colNum = "Plot_Column",
  addCheck = FALSE
)
summary(tp_data)
## tp_data contains data for experiment PSU2022.
## 
## It contains 1 time points.
## First time point: 2022-09-15 10:00:00 
## Last time point: 2022-09-15 10:00:00 
## 
## No check genotypes are defined.

5. Fit SpATS per phenotype and extract corrected values

## 5. Fit SpATS models and extract corrected phenotypes

# Initialize lists
corrected_list <- list()
fit_info <- list()
all_models <- list()
for (phen in usable_phenotypes) {
  message("Fitting SpATS model for: ", phen)
  # Use single_time (has all phenotypes)
  no_nas <- single_time %>%
    filter(!is.na(.data[[phen]]))
  
  # Fit model
  fit_try <- try(
    fitModels(
      TP = tp_data,
      trait = phen,
      extraFixedFactors = c("repId", "Treatment"),
      timePoints = 1,
      what = "fixed"
    ),
    silent = TRUE
  )
  
  if (inherits(fit_try, "try-error")) {
    message("Fit failed for ", phen, ": ", fit_try)
    next
  }
  all_models[[phen]] <- fit_try
  # Extract corrected values
  corr <- try(getCorrected(fit_try), silent = TRUE)
  if (inherits(corr, "try-error") || is.null(corr)) {
    message("getCorrected() failed for ", phen)
    next
  } 
  corr <- corr %>%
    select(-phen) 

  corrected_list[[phen]] <- corr
  fit_info[[phen]] <- list(n_obs = nrow(no_nas), status = "success")
}
# Combine all corrected data into a single data frame

sp_corrected  <- lapply(
  corrected_list,
  function(corr_df) {
    corr_df %>%
   select(timeNumber,timePoint,genotype,repId,Treatment,rowId,colId,plotId) 
  }
) %>%
bind_rows() %>%
  arrange(plotId) %>%
  distinct()  
sp_corrected 


# Join each corrected phenotype
for (phen in names(corrected_list)) {
  phen_corr <- paste0(phen, "_corr")
  corr_df <- corrected_list[[phen]] %>%
    select(plotId,all_of(phen_corr))
  sp_corrected <- left_join(
    sp_corrected,
    corr_df
  )
}

colnames(sp_corrected) <- gsub(
  "_corr", "", 
  colnames(sp_corrected)
)


# Print dimensions and preview
cat("Final sp_corrected dimensions:", dim(sp_corrected), "\n")
print(head(sp_corrected))

Spatial correction using SpATS TKW and PH example

plot(all_models$PH,
     timePoints = 1,
     plotType = "spatial",
     spaTrend = "raw") 

plot(all_models$TKW,
     timePoints = 1,
     plotType = "spatial",
     spaTrend = "raw") 

Spatially Corrected Inv4m Effects from Combined P Treatments

# T-test: Inv4m vs CTRL with FDR correction
htest <- sp_corrected %>%
  pivot_longer(
    cols = all_of(usable_phenotypes), 
    names_to = "trait", 
    values_to = "value"
  ) %>%
  group_by(trait) %>%
  t_test(value ~ genotype) %>%
  adjust_pvalue(method = "fdr") %>%
  add_y_position(scales = "free") %>%
  add_significance()  


# Filter to only FDR-significant traits (optional)
alpha <- 0.05
sig_traits <- htest %>%
  filter(p.adj < alpha) %>%
  arrange(p.adj) %>%
  pull(trait) 
  
sig_traits
## [1] "PH"  "DTA" "DTS" "HI"  "CD"  "EW"  "TKW"
pheno_theme2 <- ggpubr::theme_classic2(base_size = 20) +
  ggplot2::theme(
    plot.title = ggtext::element_markdown(hjust = 0.5, face ="bold"),
    axis.title.y = ggtext::element_markdown(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size=25, face="bold", color= "black"),
    strip.background = element_blank(),
    strip.text = ggtext::element_markdown(size=20),
    legend.position = "none")

if (length(sig_traits) == 0) {
  warning("No traits significant after FDR correction.")
} else {
  # Prepare data: pivot once, filter to significant traits
  plot_data <- sp_corrected %>%
    pivot_longer(
      cols = all_of(usable_phenotypes),
      names_to = "trait",
      values_to = "value"
    ) %>%
    filter(trait %in% sig_traits) %>%
    mutate(trait= as.character(trait)) 

  # Define y positions for brackets (just above each panel's max)
  bracket_heights <- plot_data %>%
    group_by(trait) %>%
    summarise(y_pos = max(value, na.rm = TRUE) * 1.05, .groups = "drop")

  # Join with p_value_result for labels
    test_to_plot <- htest %>%
    filter(trait %in% sig_traits)

  # Make the plot
  diff_plot <- ggplot(plot_data, aes(x = genotype, y = value, color = genotype)) +
  ggplot2::geom_boxplot(width = 0.25, linewidth=1.5,alpha=0) %>% with_shadow(
    colour = "black",
    x_offset = 0,
    y_offset = 0,
    sigma = 1) +
  ggbeeswarm::geom_quasirandom(size=2) %>% with_shadow(
  colour = "black",
  x_offset = 0,
  y_offset = 0,
  sigma = 1) +
    scale_color_manual(values = c("CTRL" = "gold", "Inv4m" = "purple4")) +
    labs(
      title = "Inv4m vs CTRL: Spatially Corrected Phenotypes (FDR-Adjusted)",
      y = "Corrected Value",
      x = "Genotype"
    ) +
    stat_pvalue_manual(
      test_to_plot ,
      tip.length = 0.01,
      step.height = 0.08,
      size = 10,
    bracket.size = 0.8,
    ) +
    scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
    facet_wrap(~ factor(trait, levels = sig_traits), scales = "free_y", ncol = 3) +
   pheno_theme2

}
  print(diff_plot)