library(lubridate)
library(lme4)
library(lmerTest)
library(ggplot2)
library(dplyr)
library(tidyr)
library(data.table)

Analysis of Variance - AM5 & HM5

These ANOVA models quantify the extent of a Genotype × Quantile interaction at each time point (measured in days after transplanting, DAT).

# Note - the distributional data files are too large to be stored on GitHub. Once you download them, save them to their own directory and replace the value of data_path_server with that path.
# Replace data_path with the folder location where you downloaded all other data related to the repository.
# You can optionally specify a path to save RData when the loop completes one round of processing ANOVAs for all DATs associated with a specific vegetation index.

data_path <- 'D:/Users/aaron.desalvio/My Drive/TAMU/Murray Lab/Writing/Manuscripts/2025_Distributional_Data/CS24_Cotton/0.0_Data/'
data_path_server <- '//murray_lab/FLIGHTS2/Processed2/Texas/CS24/MULTI/CS24-STEL/Data_Products/Individual_VIs/'
RData_path <- 'C:/Users/aaron.desalvio/Downloads/Temp_RData/'
csvs <- list.files(path = data_path_server, pattern = '\\.csv$')
all_vis <- lapply(strsplit(as.character(csvs), '-'), '[', 6) %>% unlist()

# We will change the csvs vector to only contain one element to reduce the duration of producing this RMarkdown file. If you'd like to run the entire loop with all VIs, comment out the next line.
csvs <- csvs[1]

# We'll also only run a few DATs (time points) to speed up processing of this file. If you'd like to run the entire analysis, first comment out the next line, and then UNCOMMENT the line of code marked with # UNCOMMENT HERE below.
dat.loop <- c(107, 108, 109)

Loop to run ANOVA models with Genotype × Quantile interaction at all time points

# Initialize empty lists to store results
G_list <- list()
VC_list <- list()

for (i in 1:length(csvs)) {
  
  current_vi <- all_vis[i]
  print(paste('Current VI:', current_vi))
  
  # Read data from CSV files
  print(paste('Loading VI data frame for', current_vi))
  df <- fread(paste0(data_path_server, 'CS24-STEL-MULTI-RTK-Distributions-', current_vi, '-FieldCoords.csv')) %>% as.data.frame() %>% arrange(DAT)
  print(paste0('VI data frame loaded for ', current_vi, '!'))
  
  # Remove low-quality DATs
  DATs_exclude <- c(46, 71:72, 99:100)
  df <- df %>% filter(!DAT %in% DATs_exclude)
  
  # Remove plant that died early in the season
  df <- df %>% filter(!Row_Range == '413_15')
  
  # Basic plot for inspection of data
  sort(unique(df$DAT))
  df.plot <- df %>% filter(Pedigree == 5016, DAT == 140)
  plot(df.plot$probability, df.plot$quantile)
  
  # DAT vector for loop
  # dat.loop <- sort(unique(df$DAT)) # UNCOMMENT HERE
  
  # Loop over each environment
  for (dat in dat.loop) {
    
    # Subset data for the current DAT
    df.subset <- df[df$DAT == dat, ]
    
    # Convert variables to appropriate data types
    df.subset$Range <- as.factor(df.subset$Range)
    df.subset$Row <- as.factor(df.subset$Row)
    df.subset$Replicate <- as.factor(df.subset$Replicate)
    df.subset$Pedigree <- as.factor(df.subset$Pedigree)
    df.subset$probability <- as.factor(df.subset$probability)
    
    # Progress report
    print(paste('ANOVA Model:', 'DAT', dat, 'VI', current_vi))
    
    # Fit the linear mixed-effects model after setting "Inf" and NaN as NA
    response <- df.subset$quantile
    response[is.na(response) | response == 'Inf' | is.nan(response) | response == '-Inf'] <- NA
    
    anova <- lmer(response ~
                    (1 | Pedigree) +
                    (1 | probability) +
                    (1 | Pedigree:probability) +
                    (1 | Range:probability) +
                    (1 | Row:probability) +
                    (1 | Replicate:probability),
                  data = df.subset)
    
    # Calculate RMSE
    rmse_value <- sqrt(mean(residuals(anova)^2))
    
    # Calculate R-squared
    R <- MuMIn::r.squaredGLMM(anova)[, "R2c"]  # Conditional R-squared
    
    # Extract variance components
    VC <- as.data.frame(VarCorr(anova))
    VC$Percent <- round(VC$vcov / sum(VC$vcov) * 100, 2)
    
    # Calculate heritability
    nRep <- length(unique(df.subset$Replicate))
    nProb <- length(unique(df.subset$probability))
    Vg <- VC[VC$grp == "Pedigree", "vcov"]
    Vgp <- VC[VC$grp == "Pedigree:probability", "vcov"]
    Ve <- VC[VC$grp == "Residual", "vcov"]
    #heritability <- Vg / (Vg + (Vgp / nProb) + (Ve / (nRep * nProb)))
    #heritability <- round(heritability, 3)
    heritability <- Vg / (Vg + (Vgp) + (Ve / (nRep)))
    heritability <- round(heritability, 3)
    
    # Add additional information to VC data frame
    VC$Rmse <- rmse_value
    VC$Heritability <- heritability
    VC$R_squared <- R
    VC$Vegetation.Index <- current_vi
    VC$DAT <- dat
    print(VC)
    
    # Store VC in the list
    VC_list[[paste(dat, current_vi, sep = '_')]] <- VC
    
    # Extract random effects for 'Pedigree'
    G <- coef(anova)$Pedigree
    G$Pedigree <- rownames(G)
    
    # Store G in the list
    G_list[[paste(dat, current_vi, sep = '_')]] <- G
    
  }
  # Save RData for current loop iteration
  save.image(paste0(RData_path, 'ANOVA_Within_DAT_Across_Quantile_', current_vi, '.RData'))
}
## [1] "Current VI: BCC"
## [1] "Loading VI data frame for BCC"
## [1] "VI data frame loaded for BCC!"

## [1] "ANOVA Model: DAT 107 VI BCC"
##                     grp        var1 var2         vcov        sdcor Percent
## 1       Row:probability (Intercept) <NA> 2.627023e-05 0.0051254490    3.61
## 2 Replicate:probability (Intercept) <NA> 3.157604e-08 0.0001776965    0.00
## 3  Pedigree:probability (Intercept) <NA> 0.000000e+00 0.0000000000    0.00
## 4     Range:probability (Intercept) <NA> 1.980610e-06 0.0014073415    0.27
## 5           probability (Intercept) <NA> 6.350885e-04 0.0252009626   87.36
## 6              Pedigree (Intercept) <NA> 1.737181e-05 0.0041679508    2.39
## 7              Residual        <NA> <NA> 4.623500e-05 0.0067996325    6.36
##          Rmse Heritability R_squared Vegetation.Index DAT
## 1 0.006399464        0.919 0.9364011              BCC 107
## 2 0.006399464        0.919 0.9364011              BCC 107
## 3 0.006399464        0.919 0.9364011              BCC 107
## 4 0.006399464        0.919 0.9364011              BCC 107
## 5 0.006399464        0.919 0.9364011              BCC 107
## 6 0.006399464        0.919 0.9364011              BCC 107
## 7 0.006399464        0.919 0.9364011              BCC 107
## [1] "ANOVA Model: DAT 108 VI BCC"
##                     grp        var1 var2         vcov       sdcor Percent
## 1       Row:probability (Intercept) <NA> 6.388885e-05 0.007993050    5.67
## 2 Replicate:probability (Intercept) <NA> 3.885413e-06 0.001971145    0.34
## 3  Pedigree:probability (Intercept) <NA> 1.792040e-06 0.001338671    0.16
## 4     Range:probability (Intercept) <NA> 2.949287e-06 0.001717349    0.26
## 5           probability (Intercept) <NA> 9.326327e-04 0.030539035   82.73
## 6              Pedigree (Intercept) <NA> 1.925773e-05 0.004388363    1.71
## 7              Residual        <NA> <NA> 1.029599e-04 0.010146916    9.13
##          Rmse Heritability R_squared Vegetation.Index DAT
## 1 0.009417045        0.787 0.9086721              BCC 108
## 2 0.009417045        0.787 0.9086721              BCC 108
## 3 0.009417045        0.787 0.9086721              BCC 108
## 4 0.009417045        0.787 0.9086721              BCC 108
## 5 0.009417045        0.787 0.9086721              BCC 108
## 6 0.009417045        0.787 0.9086721              BCC 108
## 7 0.009417045        0.787 0.9086721              BCC 108
## [1] "ANOVA Model: DAT 109 VI BCC"
##                     grp        var1 var2         vcov       sdcor Percent
## 1       Row:probability (Intercept) <NA> 3.712641e-05 0.006093145    4.96
## 2 Replicate:probability (Intercept) <NA> 1.362821e-06 0.001167399    0.18
## 3  Pedigree:probability (Intercept) <NA> 0.000000e+00 0.000000000    0.00
## 4     Range:probability (Intercept) <NA> 2.276986e-06 0.001508969    0.30
## 5           probability (Intercept) <NA> 6.429619e-04 0.025356693   85.87
## 6              Pedigree (Intercept) <NA> 1.546753e-05 0.003932878    2.07
## 7              Residual        <NA> <NA> 4.957899e-05 0.007041235    6.62
##          Rmse Heritability R_squared Vegetation.Index DAT
## 1 0.006565553        0.903 0.9337865              BCC 109
## 2 0.006565553        0.903 0.9337865              BCC 109
## 3 0.006565553        0.903 0.9337865              BCC 109
## 4 0.006565553        0.903 0.9337865              BCC 109
## 5 0.006565553        0.903 0.9337865              BCC 109
## 6 0.006565553        0.903 0.9337865              BCC 109
## 7 0.006565553        0.903 0.9337865              BCC 109

Save the ANOVA results

G_df <- do.call(rbind, G_list)
VC_df <- do.call(rbind, VC_list)

# View the first few rows of the results
head(G_df)
##              (Intercept) Pedigree
## 107_BCC.5001   0.3339126     5001
## 107_BCC.5002   0.3372073     5002
## 107_BCC.5003   0.3328357     5003
## 107_BCC.5004   0.3340158     5004
## 107_BCC.5005   0.3314073     5005
## 107_BCC.5006   0.3390244     5006
head(VC_df)
##                             grp        var1 var2         vcov        sdcor
## 107_BCC.1       Row:probability (Intercept) <NA> 2.627023e-05 0.0051254490
## 107_BCC.2 Replicate:probability (Intercept) <NA> 3.157604e-08 0.0001776965
## 107_BCC.3  Pedigree:probability (Intercept) <NA> 0.000000e+00 0.0000000000
## 107_BCC.4     Range:probability (Intercept) <NA> 1.980610e-06 0.0014073415
## 107_BCC.5           probability (Intercept) <NA> 6.350885e-04 0.0252009626
## 107_BCC.6              Pedigree (Intercept) <NA> 1.737181e-05 0.0041679508
##           Percent        Rmse Heritability R_squared Vegetation.Index DAT
## 107_BCC.1    3.61 0.006399464        0.919 0.9364011              BCC 107
## 107_BCC.2    0.00 0.006399464        0.919 0.9364011              BCC 107
## 107_BCC.3    0.00 0.006399464        0.919 0.9364011              BCC 107
## 107_BCC.4    0.27 0.006399464        0.919 0.9364011              BCC 107
## 107_BCC.5   87.36 0.006399464        0.919 0.9364011              BCC 107
## 107_BCC.6    2.39 0.006399464        0.919 0.9364011              BCC 107
# Data wrangling before exporting
VC_df_cols <- colnames(VC_df)
VC_df_cols <- VC_df_cols[!VC_df_cols %in% c('var1', 'var2')]
VC_df <- VC_df[,VC_df_cols]
VC_df$DAT_Prob_VI <- rownames(VC_df)

colnames(G_df)[1] <- 'BLUP'
G_df$DAT_VI.Pedigree <- rownames(G_df)
G_df$VI.Pedigree <- lapply(strsplit(as.character(G_df$DAT_VI.Pedigree), '_'), '[', 2) %>% unlist()
G_df$Vegetation.Index <- lapply(strsplit(as.character(G_df$VI.Pedigree), '\\.'), '[', 1)

# Save the data frames
#fwrite(VC_df, paste0(data_path, 'VarComp_Within_DAT_Across_Quantile_Revised_Herit.csv'))
#fwrite(G_df, paste0(data_path, 'Quantile_BLUPs_Within_DAT_Across_Quantile_Revised_Herit.csv'))