CLY 2025 Spatial Analysis: SpATS Spatial Correction for All Phenotypes

Overview

This notebook fits a SpATS PSANOVA surface per phenotype and returns spatially-corrected plot-level values for each phenotype. The approach uses statgenHTP::fitModels() (which wraps SpATS when provided a TimePoints object) and getCorrected() to extract corrected plot-level observations. The workflow:

Load and clean plant-level data from CLY2025.
Aggregate to plot-level means (matching PSU2022 structure).
Build a TimePoints object for statgenHTP.
Loop over usable phenotypes and apply fitModels() → getCorrected().
Merge corrected values into a wide table sp_corrected.
Save corrected table for downstream analyses.

Warning: fitting many SpATS models can be computationally intensive. Run interactively on a machine with sufficient RAM/CPU.

1. Load libraries

library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggtext)
library(ggfx)
library(nlme)          # Mixed-effects models (if needed)
library(gstat)         # Variograms
library(rstatix)
library(ggpubr)
library(VIM)
library(knitr)
library(SpATS)         # SpATS package (used internally by statgenHTP)
library(statgenHTP)    # Provides fitModels() and getCorrected()

2. Load and clean plant-level data

# Load the dataset
file_path <- "/Users/fvrodriguez/Desktop/inv4mHighland/data/CLY25_Inv4m.csv"
if (!file.exists(file_path)) {
  stop("Error: CLY25_Inv4m.csv not found in the working directory.")
} else {
  field_data_raw <- read.csv(file_path, na.strings = c("","#N/A","NA"))
  
  # Clean and prepare plant-level data
  field_data <- field_data_raw %>%
    # Calculate Estimated Blade Area (EBA) = 0.75 * BL * BW FIRST
    mutate(EBA = 0.75 * BL * BW) %>%
    rename(
      plant_id = plant,
      plotId = plot_id,
      block = rep,
      x = X_pos,
      y = Y_pos,
      inv4m = inv4m_gt
    ) %>%
    # Convert relevant columns to factors
    mutate(across(c(plotId, block, donor, inv4m), as.factor)) %>%
    # Add small amount of noise to x coordinates to avoid identical positions
    mutate(x = x + runif(n(), min = 0.0, max = 0.01))

  # Define phenotypes available in CLY2025
  base_phenotypes <- c("DTA", "DTS", "LAE", "PH", "EN", "SL", "BL", "BW", "EBA")
  
  # Verify all phenotypes exist in the dataset
  available_phenotypes <- intersect(base_phenotypes, names(field_data))
  all_phenotypes <- available_phenotypes
  
  cat("Plant-level data loaded and cleaned successfully!\n")
  cat("Phenotypes available for analysis:", paste(all_phenotypes, collapse = ", "), "\n")
  cat("Plant-level data dimensions:", dim(field_data), "\n")
}

## Plant-level data loaded and cleaned successfully!
## Phenotypes available for analysis: DTA, DTS, LAE, PH, EN, SL, BL, BW, EBA 
## Plant-level data dimensions: 442 24

head(field_data)

##   plant_id row_id plotId plot_row plot_col block   y        x donor inv4m
## 1        1   4573      1        1        1     3 1.7 1.003038  TMEX INV4M
## 2        2   4573      1        1        1     3 2.3 1.006843  TMEX INV4M
## 3        3   4573      1        1        1     3 3.6 1.004255  TMEX INV4M
## 4        4   4573      1        1        1     3 4.0 1.009143  TMEX INV4M
## 5        5   4573      1        1        1     3 4.5 1.005382  TMEX INV4M
## 6        6   4573      1        1        1     3 5.0 1.008563  TMEX INV4M
##        group DOA DOS DTA DTS DTA_GDD DTS_GDD LAE  PH EN   SL   BL  BW      EBA
## 1 TMEX-INV4M  20  22  77  79    1915    1975   5 192  2 13.5 53.0 8.0 318.0000
## 2 TMEX-INV4M  20  22  77  79    1915    1975   5 209  3 14.0 50.5 7.5 284.0625
## 3 TMEX-INV4M  23  24  80  81    2011    2047   5 208  3   NA   NA  NA       NA
## 4 TMEX-INV4M  23  24  80  81    2011    2047   5 212  3 14.0 53.5 7.5 300.9375
## 5 TMEX-INV4M  21  23  78  80    1943    2011   5 218  3 13.5 56.0 8.0 336.0000
## 6 TMEX-INV4M  22  24  79  81    1975    2047   5 225  3 14.0 64.0 8.5 408.0000

3. Aggregate to plot-level data (matching PSU2022 structure)

# Aggregate from plant-level to plot-level means
plot_data <- field_data %>%
  group_by(plotId, block, plot_row, plot_col, donor, inv4m) %>%
  summarise(
    across(all_of(all_phenotypes), ~ mean(.x, na.rm = TRUE)),
    n_plants = n(),
    .groups = 'drop'
  ) %>%
  # Convert NaN to NA (from mean of all NA values)
  mutate(across(all_of(all_phenotypes), ~ ifelse(is.nan(.x), NA, .x))) %>%
  # Rename columns to match PSU2022 structure
  rename(# unique plot identifier
    Rep = block,               # replicate/block
    Plot_Row = plot_row,       # row position  
    Plot_Column = plot_col,    # column position
  ) %>%
  # Ensure proper data types
  mutate(
    Plot_Row = as.numeric(Plot_Row),
    Plot_Column = as.numeric(Plot_Column),
    Rep = as.factor(Rep),
    genotype = as.factor(paste(donor,inv4m, sep =".")),  # Combine donor and inv4m for genotype
  )

cat("Plot-level summary dimensions:", dim(plot_data), "\n")

## Plot-level summary dimensions: 64 17

cat("Average plants per plot:", round(mean(plot_data$n_plants, na.rm = TRUE), 1), "\n")

## Average plants per plot: 6.9

# Check factor levels
cat("Genotype levels:", levels(plot_data$genotype), "\n")

## Genotype levels: MI21.CTRL MI21.INV4M TMEX.CTRL TMEX.INV4M

cat("Rep levels:", levels(plot_data$Rep), "\n")

## Rep levels: 1 2 3 4

head(plot_data)

## # A tibble: 6 × 17
##   plotId Rep   Plot_Row Plot_Column donor inv4m   DTA   DTS   LAE    PH    EN
##   <fct>  <fct>    <dbl>       <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1      3            1           1 TMEX  INV4M  78.3  79.7  5.14  211.  2.86
## 2 2      3            2           1 MI21  INV4M  76.3  77.6  5.71  176.  2.57
## 3 3      3            3           1 MI21  CTRL   75    75.5  5.75  193   2.75
## 4 4      3            4           1 TMEX  CTRL   77.8  78.6  5.5   199.  2.5 
## 5 5      1            5           1 MI21  INV4M  75.9  76.8  5.62  179.  2.62
## 6 6      1            6           1 TMEX  INV4M  78    78.7  5     205   3   
## # ℹ 6 more variables: SL <dbl>, BL <dbl>, BW <dbl>, EBA <dbl>, n_plants <int>,
## #   genotype <fct>

4. Identify usable phenotypes (<50% missing)

missing_summary <- plot_data %>%
  select(all_of(all_phenotypes)) %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "phenotype", values_to = "missing_count") %>%
  mutate(
    total_obs = nrow(plot_data),
    missing_pct = round(missing_count / total_obs * 100, 1),
    available_n = total_obs - missing_count
  ) %>%
  arrange(missing_pct)

print(missing_summary)

## # A tibble: 9 × 5
##   phenotype missing_count total_obs missing_pct available_n
##   <chr>             <int>     <int>       <dbl>       <int>
## 1 DTA                   0        64           0          64
## 2 DTS                   0        64           0          64
## 3 LAE                   0        64           0          64
## 4 PH                    0        64           0          64
## 5 EN                    0        64           0          64
## 6 SL                    0        64           0          64
## 7 BL                    0        64           0          64
## 8 BW                    0        64           0          64
## 9 EBA                   0        64           0          64

usable_phenotypes <- missing_summary %>%
  filter(missing_pct < 50) %>%
  pull(phenotype)

cat("Usable phenotypes (<50% missing):", paste(usable_phenotypes, collapse = ", "), "\n")

## Usable phenotypes (<50% missing): DTA, DTS, LAE, PH, EN, SL, BL, BW, EBA

5. Build TimePoints object for statgenHTP

single_time <- plot_data %>%
  mutate(
    plotId= as.character(plotId),  # Ensure plotId is character
    Rep = as.character(Rep),
    Plot_Row = as.numeric(Plot_Row),
    Plot_Column = as.numeric(Plot_Column)
  ) %>%
  mutate(time = as.POSIXct("2025-01-15 10:00:00"))  # Use CLY2025 date


tp_data <- createTimePoints(
  dat = single_time,
  experimentName = "CLY2025",
  genotype="genotype",
  timePoint = "time",
  plotId = "plotId",  # unique ID per plot
  repId = "Rep",
  rowNum = "Plot_Row",
  colNum = "Plot_Column",
  addCheck = FALSE
)
summary(tp_data)

## tp_data contains data for experiment CLY2025.
## 
## It contains 1 time points.
## First time point: 2025-01-15 10:00:00 
## Last time point: 2025-01-15 10:00:00 
## 
## No check genotypes are defined.

6. Fit SpATS per phenotype and extract corrected values

# Initialize lists
corrected_list <- list()
fit_info <- list()
all_models <- list()

for (phen in usable_phenotypes) {
  message("Fitting SpATS model for: ", phen)
  # Check if we have enough non-missing observations
  no_nas <- single_time %>%
    filter(!is.na(.data[[phen]]))
  
  if (nrow(no_nas) < 10) {
    message("Insufficient data for ", phen, " (n=", nrow(no_nas), ")")
    next
  }
  
  # Fit model
  fit_try <- try(
    fitModels(
      TP = tp_data,
      trait = phen,
      extraFixedFactors = c("repId"),
      timePoints = 1,
      what = "fixed"
    ),
    silent = TRUE
  )
  
  if (inherits(fit_try, "try-error")) {
    message("Fit failed for ", phen, ": ", fit_try)
    next
  }
  
  all_models[[phen]] <- fit_try
  
  # Extract corrected values
  corr <- try(getCorrected(fit_try), silent = TRUE)
  if (inherits(corr, "try-error") || is.null(corr)) {
    message("getCorrected() failed for ", phen)
    next
  } 
  
  corr <- corr %>%
    select(-phen) 

  corrected_list[[phen]] <- corr
  fit_info[[phen]] <- list(n_obs = nrow(no_nas), status = "success")
}

# Combine all corrected data into a single data frame
if (length(corrected_list) > 0) {
  sp_corrected <- lapply(
    corrected_list,
    function(corr_df) {
      corr_df %>%
        select(timeNumber, timePoint, genotype, repId, rowId, colId, plotId) 
    }
  ) %>%
    bind_rows() %>%
    arrange(plotId) %>%
    distinct()
  
  # Join each corrected phenotype
  for (phen in names(corrected_list)) {
    phen_corr <- paste0(phen, "_corr")
    corr_df <- corrected_list[[phen]] %>%
      select(plotId, all_of(phen_corr))
    sp_corrected <- left_join(
      sp_corrected,
      corr_df,
      by = "plotId"
    )
  }
  
  # Clean column names (remove _corr suffix)
  colnames(sp_corrected) <- gsub("_corr", "", colnames(sp_corrected))

sp_corrected <-  sp_corrected %>% 
separate(genotype,  c("donor","genotype"))  
  # Print dimensions and preview
  cat("Final sp_corrected dimensions:", dim(sp_corrected), "\n")
  print(head(sp_corrected))
  
} else {
  message("No models were successfully fitted.")
}

7. Spatial correction visualization (example for first two traits)

  trait_names <- c("DTS","PH")  # Example traits to visualize
  
  for (trait in trait_names) {
    cat("\n### Spatial plots for", trait, "\n")
    
    # Model diagnostic plots
    plot(all_models[[trait]], timePoints = 1)
    
    # Spatial trend plots
    plot(all_models[[trait]], 
         timePoints = 1,
         plotType = "spatial",
         spaTrend = "raw")
  }

## 
## ### Spatial plots for DTS

## 
## ### Spatial plots for PH

8. Donor-specific second-stage analysis: Spatially corrected effects

if (exists("sp_corrected") && nrow(sp_corrected) > 0) {
  
  # Get corrected phenotype names (exclude metadata columns)
  corrected_phenotypes <- intersect(usable_phenotypes, colnames(sp_corrected))
  
  if (length(corrected_phenotypes) > 0) {
    
    # Define comparison groups
    comparison_genotypes <- unique(sp_corrected$genotype)
    cat("Available genotypes:", paste(comparison_genotypes, collapse = ", "), "\n")
    
    # Get unique donors
    donors <- unique(sp_corrected$donor)
    cat("Available donors:", paste(donors, collapse = ", "), "\n")
    
    # T-test comparing genotypes with FDR correction (per donor)
    htest <- sp_corrected %>% 
      pivot_longer(
        cols = all_of(corrected_phenotypes), 
        names_to = "trait", 
        values_to = "value"
      ) %>%
      filter(!is.na(value)) %>%
      group_by(donor, trait) %>%
      # Only perform t-test if we have exactly 2 genotype levels
      filter(n_distinct(genotype) == 2) %>%
      t_test(value ~ genotype) %>%
      adjust_pvalue(method = "fdr") %>%
      add_y_position(scales = "free") %>%
      add_significance()
    
    print(htest)
    
    # Plot theme
    pheno_theme2 <- ggpubr::theme_classic2(base_size = 20) +
      ggplot2::theme(
        plot.title = ggtext::element_markdown(hjust = 0.5, face = "bold"),
        axis.title.y = ggtext::element_markdown(),
        axis.title.x = element_blank(),
        axis.text.x = element_text(size = 25, face = "bold", color = "black"),
        strip.background = element_blank(),
        strip.text = ggtext::element_markdown(size = 20),
        legend.position = "none"
      )
    
    # Main loop: iterate through each donor
    for (current_donor in donors) {
      cat("\n=== Processing donor:", current_donor, "===\n")
      
      # Donor-specific filtering: get significant traits for this donor only
      alpha <- 0.05
      sig_traits_donor <- htest %>%
        filter(donor == current_donor, p.adj < alpha) %>%
        arrange(p.adj) %>%
        pull(trait)
      
      cat("Significant traits for", current_donor, "(FDR < 0.05):", 
          paste(sig_traits_donor, collapse = ", "), "\n")
      
      if (length(sig_traits_donor) == 0) {
        cat("No traits significant after FDR correction for donor", current_donor, "\n")
        next
      }
      
      # Data filtering: prepare data for this donor and significant traits only
      plot_data_donor <- sp_corrected %>%
        filter(donor == current_donor) %>%
        pivot_longer(
          cols = all_of(corrected_phenotypes),
          names_to = "trait",
          values_to = "value"
        ) %>%
        filter(trait %in% sig_traits_donor, !is.na(value)) %>%
        mutate(trait = as.character(trait))
      
      # Statistical tests: get p-values for this donor's significant traits
      test_to_plot_donor <- htest %>%
        filter(donor == current_donor, trait %in% sig_traits_donor)
      
      # Plot customization: adaptive faceting based on number of traits
      n_traits <- length(sig_traits_donor)
      ncol_facets <- min(4, n_traits)
      
      # Create donor-specific plot with custom title
      diff_plot <- ggplot(plot_data_donor, aes(x = genotype, y = value, color = genotype)) +
        ggplot2::geom_boxplot(width = 0.25, linewidth = 1.5, alpha = 0) +
        ggbeeswarm::geom_quasirandom(size = 2) +
        scale_color_manual(values = c("CTRL" = "gold", "INV4M" = "purple4")) +
        labs(
          title = paste("CLY25: ", current_donor, " Spatially Corrected (FDR-Adjusted)"),
          y = "Corrected Value",
          x = "Genotype"
        ) +
        stat_pvalue_manual(
          test_to_plot_donor,
          tip.length = 0.01,
          step.height = 0.08,
          size = 10,
          bracket.size = 0.8
        ) +
        scale_y_continuous(expand = expansion(mult = c(0.1, 0.15))) +
        facet_wrap(~ factor(trait, levels = sig_traits_donor), 
                   scales = "free_y", 
                   ncol = ncol_facets) +
        pheno_theme2
      
      print(diff_plot)
      
      # Print summary statistics for this donor
      cat("\n--- Summary for", current_donor, "---\n")
      summary_stats <- plot_data_donor %>%
        group_by(trait, genotype) %>%
        summarise(
          n = n(),
          mean = round(mean(value, na.rm = TRUE), 3),
          sd = round(sd(value, na.rm = TRUE), 3),
          .groups = "drop"
        ) %>%
        pivot_wider(
          names_from = genotype,
          values_from = c(n, mean, sd),
          names_sep = "_"
        )
      
      print(summary_stats)
    }
    
    # Overall summary across all donors
    cat("\n=== Overall Summary Across All Donors ===\n")
    overall_sig <- htest %>%
      filter(p.adj < alpha) %>%
      group_by(trait) %>%
      summarise(
        n_donors_significant = n(),
        min_pvalue = min(p.adj),
        max_pvalue = max(p.adj),
        .groups = "drop"
      ) %>%
      arrange(desc(n_donors_significant), min_pvalue)
    
    print(overall_sig)
    
  } else {
    message("No corrected phenotypes available for analysis")
  }
} else {
  message("No spatially corrected data available")
}

## Available genotypes: INV4M, CTRL 
## Available donors: TMEX, MI21 
## # A tibble: 18 × 14
##    donor trait .y.   group1 group2    n1    n2 statistic    df         p   p.adj
##    <chr> <chr> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>     <dbl>   <dbl>
##  1 MI21  BL    value CTRL   INV4M     16    16     5.95   29.9   1.63e-6 7.33e-6
##  2 MI21  BW    value CTRL   INV4M     16    16     5.64   30.0   3.86e-6 1.39e-5
##  3 MI21  DTA   value CTRL   INV4M     16    16     2.19   29.7   3.62e-2 4.98e-2
##  4 MI21  DTS   value CTRL   INV4M     16    16     3.16   30.0   3.57e-3 6.43e-3
##  5 MI21  EBA   value CTRL   INV4M     16    16     6.97   29.6   1.04e-7 6.24e-7
##  6 MI21  EN    value CTRL   INV4M     16    16    -0.903  29.8   3.74e-1 3.96e-1
##  7 MI21  LAE   value CTRL   INV4M     16    16    -2.16   29.7   3.87e-2 4.98e-2
##  8 MI21  PH    value CTRL   INV4M     16    16     3.62   29.7   1.09e-3 2.18e-3
##  9 MI21  SL    value CTRL   INV4M     16    16    -0.988  27.8   3.32e-1 3.74e-1
## 10 TMEX  BL    value CTRL   INV4M     16    16     1.17   29.9   2.52e-1 3.02e-1
## 11 TMEX  BW    value CTRL   INV4M     16    16     2.88   26.7   7.79e-3 1.27e-2
## 12 TMEX  DTA   value CTRL   INV4M     16    16    -8.05   29.3   6.61e-9 1.19e-7
## 13 TMEX  DTS   value CTRL   INV4M     16    16    -7.86   26.2   2.3 e-8 2.07e-7
## 14 TMEX  EBA   value CTRL   INV4M     16    16     2.62   29.5   1.37e-2 2.06e-2
## 15 TMEX  EN    value CTRL   INV4M     16    16    -4.25   27.9   2.15e-4 4.84e-4
## 16 TMEX  LAE   value CTRL   INV4M     16    16     4.54   29.1   8.97e-5 2.31e-4
## 17 TMEX  PH    value CTRL   INV4M     16    16    -4.70   25.5   7.66e-5 2.30e-4
## 18 TMEX  SL    value CTRL   INV4M     16    16     0.214  30.0   8.32e-1 8.32e-1
## # ℹ 3 more variables: y.position <dbl>, groups <named list>, p.adj.signif <chr>
## 
## === Processing donor: TMEX ===
## Significant traits for TMEX (FDR < 0.05): DTA, DTS, PH, LAE, EN, BW, EBA

## 
## --- Summary for TMEX ---
## # A tibble: 7 × 7
##   trait n_CTRL n_INV4M mean_CTRL mean_INV4M sd_CTRL sd_INV4M
##   <chr>  <int>   <int>     <dbl>      <dbl>   <dbl>    <dbl>
## 1 BW        16      16      8.62       8.19   0.336    0.485
## 2 DTA       16      16     77.7       79.3    0.545    0.639
## 3 DTS       16      16     78.8       80.6    0.503    0.75 
## 4 EBA       16      16    393.       368.    24.5     27.8  
## 5 EN        16      16      2.20       2.53   0.183    0.244
## 6 LAE       16      16      6.03       5.73   0.203    0.171
## 7 PH        16      16    194.       204.     4.74     7.40 
## 
## === Processing donor: MI21 ===
## Significant traits for MI21 (FDR < 0.05): EBA, BL, BW, PH, DTS, DTA, LAE

## 
## --- Summary for MI21 ---
## # A tibble: 7 × 7
##   trait n_CTRL n_INV4M mean_CTRL mean_INV4M sd_CTRL sd_INV4M
##   <chr>  <int>   <int>     <dbl>      <dbl>   <dbl>    <dbl>
## 1 BL        16      16     58.3       55.3    1.49     1.39 
## 2 BW        16      16      8.78       8.14   0.32     0.321
## 3 DTA       16      16     77.7       77.3    0.615    0.552
## 4 DTS       16      16     78.4       77.6    0.638    0.66 
## 5 EBA       16      16    386.       338.    20.6     18.4  
## 6 LAE       16      16      5.89       6.02   0.157    0.172
## 7 PH        16      16    187.       183.     3.50     3.86 
## 
## === Overall Summary Across All Donors ===
## # A tibble: 8 × 4
##   trait n_donors_significant  min_pvalue max_pvalue
##   <chr>                <int>       <dbl>      <dbl>
## 1 DTA                      2 0.000000119 0.0498    
## 2 DTS                      2 0.000000207 0.00643   
## 3 EBA                      2 0.000000624 0.0206    
## 4 BW                       2 0.0000139   0.0127    
## 5 PH                       2 0.000230    0.00218   
## 6 LAE                      2 0.000231    0.0498    
## 7 BL                       1 0.00000734  0.00000734
## 8 EN                       1 0.000484    0.000484

9. Summary and export

# Print summary of analysis
cat("\n=== Analysis Summary ===\n")

## 
## === Analysis Summary ===

cat("Total phenotypes evaluated:", length(all_phenotypes), "\n")

## Total phenotypes evaluated: 9

cat("Usable phenotypes (<50% missing):", length(usable_phenotypes), "\n")

## Usable phenotypes (<50% missing): 9

cat("Successfully fitted models:", length(all_models), "\n")

## Successfully fitted models: 9

if (exists("sp_corrected") && nrow(sp_corrected) > 0) {
  cat("Plots with corrected data:", nrow(sp_corrected), "\n")
  
  # Export corrected data
  output_file <- "CLY2025_spatially_corrected_phenotypes.csv"
  write.csv(sp_corrected, output_file, row.names = FALSE)
  cat("Spatially corrected data exported to:", output_file, "\n")
} else {
  cat("No corrected data to export\n")
}

## Plots with corrected data: 64 
## Spatially corrected data exported to: CLY2025_spatially_corrected_phenotypes.csv

# Print model fit information
if (length(fit_info) > 0) {
  cat("\n=== Model Fit Details ===\n")
  for (trait in names(fit_info)) {
    info <- fit_info[[trait]]
    cat(sprintf("%s: n=%d, status=%s\n", trait, info$n_obs, info$status))
  }
}

## 
## === Model Fit Details ===
## DTA: n=64, status=success
## DTS: n=64, status=success
## LAE: n=64, status=success
## PH: n=64, status=success
## EN: n=64, status=success
## SL: n=64, status=success
## BL: n=64, status=success
## BW: n=64, status=success
## EBA: n=64, status=success