PSU 2022 Spatial Analysis: SpATS Spatial Correction with GDD Integration

Overview

This notebook fits a SpATS PSANOVA surface per phenotype and returns spatially-corrected plot-level values for each phenotype, including GDD-based traits (GDDA and GDDS). The approach uses statgenHTP::fitModels() (which wraps SpATS when provided a TimePoints object) and getCorrected() to extract corrected plot-level observations. The workflow:

Load GDD lookup table for PSU2022.
Load and clean data (as in prior notebook).
Create plot_data (plot-level means) and merge GDD values.
Build a TimePoints object for statgenHTP.
Loop over usable phenotypes (including GDDA, GDDS) and apply fitModels() → getCorrected().
Merge corrected values into a wide table sp_corrected.
Save corrected table for downstream analyses.

Warning: fitting many SpATS models can be computationally intensive. Run interactively on a machine with sufficient RAM/CPU.

1. Load libraries

library(tidyverse)
library(dplyr)
library(lubridate)
library(ggplot2)
library(ggtext)
library(ggfx)
library(nlme)
library(gstat)
library(rstatix)
library(ggpubr)
library(VIM)
library(knitr)
library(SpATS)
library(statgenHTP)
library(ggbeeswarm)

1b. Configuration and load GDD lookup table

environment_name <- "PSU2022"
planting_date <- mdy("5/26/2022")
gdd_lookup_file <- "gdd_lookup_table.csv"

gdd_at_phenology <- read_csv(gdd_lookup_file) %>%
  filter(environment == environment_name)

cat("Loaded GDD lookup for", environment_name, "with", 
    nrow(gdd_at_phenology), "dates\n")

## Loaded GDD lookup for PSU2022 with 127 dates

cat("GDD range:", min(gdd_at_phenology$cumulative_gdd_from_planting), "to", 
    max(gdd_at_phenology$cumulative_gdd_from_planting), "°C\n")

## GDD range: 6.28 to 1412.355 °C

2. Load and prepare data

DATA_DIR <- "/Users/fvrodriguez/Desktop/inv4mHighland/data"
plant_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field.csv")
ear_csv <- file.path(DATA_DIR, "22_NCS_PSU_LANGEBIO_FIELDS_PSU_P_field_ear_pheno.csv")

field_data_raw <- read.csv(plant_csv, na.strings = c("", "#N/A", "NA"))
ear_data_raw <- read.csv(ear_csv, na.strings = c("", "n/a", "NA"), skip = 1)

field_data <- field_data_raw %>%
  filter(P22. >= 3004, P22. <= 4192) %>%
  rename(
    rowid = P22.,
    Genotype = Who.What,
    PH = Height_Anthesis,
    STW40 = X40_DAP_dw,
    STW50 = X50_DAP_dw, 
    STW60 = X60_DAP_dw,
    STWHV = harvest_dw
  ) %>%
  mutate(
    Treatment = factor(Treatment, levels = c("HighP", "LowP")),
    Genotype = factor(Genotype),
    Rep = as.factor(Rep)
  ) %>%
  mutate(
    Treatment = fct_recode(Treatment, "+P" = "HighP", "-P" = "LowP"),
    Genotype = fct_recode(Genotype, "Inv4m" = "INV4M")
  )

ear_data <- ear_data_raw %>%
  select(-description, -RK, -CC, -NIR, ear_rep = rep) %>%
  rename(rowid = row) %>%
  arrange(rowid) %>%
  group_by(rowid) %>%
  select(-ear_rep) %>%
  summarise_all(mean, na.rm = TRUE) %>%
  droplevels()

field_data_complete <- field_data %>%
  left_join(ear_data, by = "rowid") %>%
  mutate(HI = TKW / STWHV) %>%
  mutate(
    Plot_Column = case_when(
      Plot == "PlotVIII" ~ Plot_Column + 10,
      Plot == "PlotVI" ~ Plot_Column,
      TRUE ~ Plot_Column
    )
  ) %>%
  filter(!is.na(PH)) %>%
  droplevels()

base_phenotypes <- c("PH", "DTA", "DTS", "STW40", "STW50", "STW60", "STWHV")
ear_phenotypes <- c("EW", "EL", "ED", "KRN", "CD", "TKW", "KW50", "TKN", "HI")
all_phenotypes <- unique(c(base_phenotypes, ear_phenotypes))

cat("Field data dimensions:", dim(field_data_complete), "\n")

## Field data dimensions: 127 39

cat("Phenotypes available:", paste(all_phenotypes, collapse = ", "), "\n")

## Phenotypes available: PH, DTA, DTS, STW40, STW50, STW60, STWHV, EW, EL, ED, KRN, CD, TKW, KW50, TKN, HI

3. Create plot-level data with GDD integration

plot_data <- field_data_complete %>%
  group_by(rowid, Rep, Plot_Row, Plot_Column, Genotype, Treatment) %>%
  summarise(
    across(all_of(all_phenotypes), ~ mean(.x, na.rm = TRUE)),
    n_plants = n(),
    .groups = 'drop'
  ) %>%
  mutate(
    across(all_of(all_phenotypes), ~ ifelse(is.nan(.x), NA, .x)),
    environment = environment_name,
    anthesis_date = planting_date + days(round(DTA)),
    silking_date = planting_date + days(round(DTS))
  ) %>%
  left_join(
    gdd_at_phenology %>% 
      rename(anthesis_date = date, GDDA = cumulative_gdd_from_planting),
    by = c("environment", "anthesis_date")
  ) %>%
  left_join(
    gdd_at_phenology %>% 
      rename(silking_date = date, GDDS = cumulative_gdd_from_planting),
    by = c("environment", "silking_date")
  ) %>%
  select(-environment, -anthesis_date, -silking_date)

plot_data$Plot_Row <- as.numeric(plot_data$Plot_Row)
plot_data$Plot_Column <- as.numeric(plot_data$Plot_Column)
plot_data$Rep <- as.factor(plot_data$Rep)

cat("Plot-level summary dimensions:", dim(plot_data), "\n")

## Plot-level summary dimensions: 127 25

cat("Average plants per plot:", round(mean(plot_data$n_plants, na.rm = TRUE), 1), "\n")

## Average plants per plot: 1

cat("GDD columns added: GDDA (anthesis), GDDS (silking)\n")

## GDD columns added: GDDA (anthesis), GDDS (silking)

cat("Plots with GDDA:", sum(!is.na(plot_data$GDDA)), "\n")

## Plots with GDDA: 127

cat("Plots with GDDS:", sum(!is.na(plot_data$GDDS)), "\n")

## Plots with GDDS: 127

4. Identify usable phenotypes (<50% missing)

all_traits <- c(all_phenotypes, "GDDA", "GDDS")

missing_summary <- plot_data %>%
  select(all_of(all_traits)) %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "phenotype", values_to = "missing_count") %>%
  mutate(
    total_obs = nrow(plot_data),
    missing_pct = round(missing_count / total_obs * 100, 1),
    available_n = total_obs - missing_count
  ) %>%
  arrange(missing_pct)

print(missing_summary)

## # A tibble: 18 × 5
##    phenotype missing_count total_obs missing_pct available_n
##    <chr>             <int>     <int>       <dbl>       <int>
##  1 PH                    0       127         0           127
##  2 DTA                   0       127         0           127
##  3 DTS                   0       127         0           127
##  4 GDDA                  0       127         0           127
##  5 GDDS                  0       127         0           127
##  6 STW40                 2       127         1.6         125
##  7 STWHV                 2       127         1.6         125
##  8 STW50                15       127        11.8         112
##  9 STW60                20       127        15.7         107
## 10 EW                   22       127        17.3         105
## 11 EL                   22       127        17.3         105
## 12 ED                   22       127        17.3         105
## 13 KRN                  22       127        17.3         105
## 14 CD                   22       127        17.3         105
## 15 TKW                  22       127        17.3         105
## 16 KW50                 22       127        17.3         105
## 17 TKN                  22       127        17.3         105
## 18 HI                   22       127        17.3         105

usable_phenotypes <- missing_summary %>%
  filter(missing_pct < 50) %>%
  pull(phenotype)

cat("Usable phenotypes (<50% missing):", paste(usable_phenotypes, collapse = ", "), "\n")

## Usable phenotypes (<50% missing): PH, DTA, DTS, GDDA, GDDS, STW40, STWHV, STW50, STW60, EW, EL, ED, KRN, CD, TKW, KW50, TKN, HI

5. Build TimePoints object for statgenHTP

single_time <- plot_data %>%
  mutate(
    Genotype = as.character(Genotype),
    Treatment = as.character(Treatment),
    Rep = as.character(Rep),
    Plot_Row = as.numeric(Plot_Row),
    Plot_Column = as.numeric(Plot_Column),
    time = as.POSIXct("2022-05-22 10:00:00")
  )

tp_data <- createTimePoints(
  dat = single_time,
  experimentName = environment_name,
  genotype = "Genotype",
  timePoint = "time",
  plotId = "rowid",
  repId = "Rep",
  rowNum = "Plot_Row",
  colNum = "Plot_Column",
  checkGenotypes = c("NUE", "B73"),
  addCheck = TRUE
)

summary(tp_data)

## tp_data contains data for experiment PSU2022.
## 
## It contains 1 time points.
## First time point: 2022-05-22 10:00:00 
## Last time point: 2022-05-22 10:00:00 
## 
## The following genotypes are defined as check genotypes: NUE, B73.

6. Fit SpATS per phenotype and extract corrected values

corrected_list <- list()
fit_info <- list()
all_models <- list()

for (phen in usable_phenotypes) {
  message("Fitting SpATS model for: ", phen)
  
  no_nas <- single_time %>%
    filter(!is.na(.data[[phen]]))
  
  if (nrow(no_nas) < 10) {
    message("Insufficient data for ", phen, " (n=", nrow(no_nas), ")")
    next
  }
  
  fit_try <- try(
    fitModels(
      TP = tp_data,
      trait = phen,
      extraFixedFactors = c("repId", "Treatment"),
      timePoints = 1,
      what = "fixed"
    ),
    silent = TRUE
  )
  
  if (inherits(fit_try, "try-error")) {
    message("Fit failed for ", phen, ": ", fit_try)
    next
  }
  
  all_models[[phen]] <- fit_try
  
  corr <- try(getCorrected(fit_try), silent = TRUE)
  if (inherits(corr, "try-error") || is.null(corr)) {
    message("getCorrected() failed for ", phen)
    next
  }
  
  corr <- corr %>%
    select(-phen)
  
  corrected_list[[phen]] <- corr
  fit_info[[phen]] <- list(n_obs = nrow(no_nas), status = "success")
}

sp_corrected <- lapply(
  corrected_list,
  function(corr_df) {
    corr_df %>%
      select(timeNumber, timePoint, genotype, repId, Treatment, 
             rowId, colId, plotId)
  }
) %>%
  bind_rows() %>%
  arrange(plotId) %>%
  distinct()

for (phen in names(corrected_list)) {
  phen_corr <- paste0(phen, "_corr")
  corr_df <- corrected_list[[phen]] %>%
    select(plotId, all_of(phen_corr))
  sp_corrected <- left_join(sp_corrected, corr_df, by = "plotId")
}

colnames(sp_corrected) <- gsub("_corr", "", colnames(sp_corrected))

cat("Final sp_corrected dimensions:", dim(sp_corrected), "\n")
cat("Corrected traits:", 
    paste(intersect(usable_phenotypes, colnames(sp_corrected)), 
          collapse = ", "), "\n")
print(head(sp_corrected))

7. Spatial correction visualization (example for PH and HI)

if ("PH" %in% names(all_models)) {
  plot(all_models$PH, timePoints = 1)
  plot(all_models$PH, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}

if ("HI" %in% names(all_models)) {
  plot(all_models$HI, timePoints = 1)
  plot(all_models$HI, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}

if ("GDDA" %in% names(all_models)) {
  cat("\n### Spatial plots for GDDA\n")
  plot(all_models$GDDA, timePoints = 1)
  plot(all_models$GDDA, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}

## 
## ### Spatial plots for GDDA

if ("GDDS" %in% names(all_models)) {
  cat("\n### Spatial plots for GDDS\n")
  plot(all_models$GDDS, timePoints = 1)
  plot(all_models$GDDS, timePoints = 1, plotType = "spatial", spaTrend = "raw")
}

## 
## ### Spatial plots for GDDS

8. Spatially Corrected Inv4m Effects from Combined P Treatments

NIL_corrected <- sp_corrected %>%
  filter(genotype %in% c("Inv4m", "CTRL")) %>%
  droplevels()

htest <- NIL_corrected %>%
  pivot_longer(
    cols = all_of(usable_phenotypes), 
    names_to = "trait", 
    values_to = "value"
  ) %>%
  filter(!is.na(value)) %>%
  group_by(trait) %>%
  t_test(value ~ genotype) %>%
  adjust_pvalue(method = "fdr") %>%
  add_y_position(scales = "free") %>%
  add_significance() %>%
  arrange(p.adj)

print(htest)

## # A tibble: 18 × 13
##    trait .y.   group1 group2    n1    n2 statistic    df           p      p.adj
##    <chr> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>       <dbl>      <dbl>
##  1 PH    value CTRL   Inv4m     32    32    -5.82   61.3 0.000000232 0.00000418
##  2 DTA   value CTRL   Inv4m     32    32     4.89   60.4 0.00000793  0.0000714 
##  3 DTS   value CTRL   Inv4m     32    32     4.73   58.9 0.0000143   0.0000738 
##  4 GDDA  value CTRL   Inv4m     32    32     4.69   60.1 0.0000164   0.0000738 
##  5 GDDS  value CTRL   Inv4m     32    32     4.60   55.7 0.0000247   0.0000889 
##  6 HI    value CTRL   Inv4m     26    27    -2.65   48.6 0.0107      0.0321    
##  7 CD    value CTRL   Inv4m     26    27     2.41   35.6 0.0211      0.0475    
##  8 STWHV value CTRL   Inv4m     30    32     2.41   58.6 0.0191      0.0475    
##  9 TKW   value CTRL   Inv4m     26    27    -2.12   43.6 0.0396      0.0792    
## 10 KRN   value CTRL   Inv4m     26    27     2.00   43.7 0.0515      0.0854    
## 11 STW40 value CTRL   Inv4m     31    32     1.98   59.9 0.0522      0.0854    
## 12 EW    value CTRL   Inv4m     26    27    -1.95   43.6 0.0582      0.0873    
## 13 ED    value CTRL   Inv4m     26    27     1.74   34.0 0.0913      0.126     
## 14 STW50 value CTRL   Inv4m     28    29     1.57   51.0 0.123       0.158     
## 15 EL    value CTRL   Inv4m     26    27    -1.31   48.6 0.195       0.234     
## 16 TKN   value CTRL   Inv4m     26    27    -1.27   42.8 0.21        0.236     
## 17 STW60 value CTRL   Inv4m     26    28    -0.549  48.7 0.586       0.620     
## 18 KW50  value CTRL   Inv4m     26    27    -0.332  37.4 0.742       0.742     
## # ℹ 3 more variables: y.position <dbl>, groups <named list>, p.adj.signif <chr>

alpha <- 0.05
sig_traits <- htest %>%
  filter(p.adj < alpha) %>%
  arrange(p.adj) %>%
  pull(trait)

cat("Significant traits (FDR < 0.05):", paste(sig_traits, collapse = ", "), "\n")

## Significant traits (FDR < 0.05): PH, DTA, DTS, GDDA, GDDS, HI, CD, STWHV

pheno_theme2 <- theme_classic2(base_size = 20) +
  theme(
    plot.title = element_markdown(hjust = 0.5, face = "bold"),
    axis.title.y = element_markdown(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 25, face = "bold", color = "black"),
    strip.background = element_blank(),
    strip.text = element_markdown(size = 20),
    legend.position = "none"
  )

if (length(sig_traits) == 0) {
  warning("No traits significant after FDR correction.")
} else {
  plot_data_sig <- NIL_corrected %>%
    pivot_longer(
      cols = all_of(usable_phenotypes),
      names_to = "trait",
      values_to = "value"
    ) %>%
    filter(trait %in% sig_traits, !is.na(value)) %>%
    mutate(trait = as.character(trait))

  test_to_plot <- htest %>%
    filter(trait %in% sig_traits)

  diff_plot <- ggplot(plot_data_sig, aes(x = genotype, y = value, color = genotype)) +
    geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0) %>% 
    with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
    geom_quasirandom(size = 2) %>% 
    with_shadow(colour = "black", x_offset = 0, y_offset = 0, sigma = 1) +
    scale_color_manual(values = c("CTRL" = "gold", "Inv4m" = "purple4")) +
    labs(
      title = "Inv4m vs CTRL: Spatially Corrected Phenotypes (FDR-Adjusted)",
      y = "Corrected Value",
      x = "Genotype"
    ) +
    stat_pvalue_manual(
      test_to_plot,
      tip.length = 0.01,
      step.height = 0.08,
      size = 10,
      bracket.size = 0.8
    ) +
    scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
    facet_wrap(~ factor(trait, levels = sig_traits), scales = "free_y", ncol = 4) +
    pheno_theme2

  print(diff_plot)
}

9. Summary and export

cat("\n=== Analysis Summary ===\n")

## 
## === Analysis Summary ===

cat("Total phenotypes evaluated:", length(all_traits), "\n")

## Total phenotypes evaluated: 18

cat("Usable phenotypes (<50% missing):", length(usable_phenotypes), "\n")

## Usable phenotypes (<50% missing): 18

cat("Successfully fitted models:", length(all_models), "\n")

## Successfully fitted models: 18

if (exists("sp_corrected") && nrow(sp_corrected) > 0) {
  cat("Plots with corrected data:", nrow(sp_corrected), "\n")
  
  output_file <- paste0(environment_name, "_spatially_corrected_phenotypes.csv")
  write_csv(sp_corrected, output_file)
  cat("Spatially corrected data exported to:", output_file, "\n")
} else {
  cat("No corrected data to export\n")
}

## Plots with corrected data: 127 
## Spatially corrected data exported to: PSU2022_spatially_corrected_phenotypes.csv

if (length(fit_info) > 0) {
  cat("\n=== Model Fit Details ===\n")
  for (trait in names(fit_info)) {
    info <- fit_info[[trait]]
    cat(sprintf("%s: n=%d, status=%s\n", trait, info$n_obs, info$status))
  }
}

## 
## === Model Fit Details ===
## PH: n=127, status=success
## DTA: n=127, status=success
## DTS: n=127, status=success
## GDDA: n=127, status=success
## GDDS: n=127, status=success
## STW40: n=125, status=success
## STWHV: n=125, status=success
## STW50: n=112, status=success
## STW60: n=107, status=success
## EW: n=105, status=success
## EL: n=105, status=success
## ED: n=105, status=success
## KRN: n=105, status=success
## CD: n=105, status=success
## TKW: n=105, status=success
## KW50: n=105, status=success
## TKN: n=105, status=success
## HI: n=105, status=success