library(lubridate)
library(lme4)
library(lmerTest)
library(ggplot2)
library(dplyr)
library(tidyr)
library(data.table)

Analysis of Variance - AM1 & HM1

The following ANOVA models are primarily aimed at quantifying the degree of Genotype x Time interaction present in the vegetation indices. We’re using “standard” VIs, meaning only the median value is being tested.

# Set working directory and source external functions
# Replace this path with the actual path where you saved the repository's files
data_path <- 'D:/Users/aaron.desalvio/My Drive/TAMU/Murray Lab/Writing/Manuscripts/2025_Distributional_Data/CS24_Cotton/0.0_Data/'

# Read data from CSV files
df <- data.table::fread(paste0(data_path, 'CS24-STEL-MULTI-RTK-VIs-FieldCoords.csv')) %>% as.data.frame() %>% arrange(DAT)

# Remove low-quality DATs
DATs_exclude <- c(46, 71:72, 99:100)
df <- df %>% filter(!DAT %in% DATs_exclude)

# Remove plant that died early in the season
df <- df %>% filter(!Row_Range == '413_15')

# Basic plot for inspection of data
sort(unique(df$DAT))
##   [1]  35  36  37  38  39  40  41  42  43  44  45  47  48  49  50  51  52  53
##  [19]  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  73
##  [37]  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
##  [55]  92  93  94  95  96  97  98 101 102 103 104 105 106 107 108 109 110 111
##  [73] 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
##  [91] 130 131 132 133 134 135 136 137 138 139 140
df.plot <- df %>% filter(Pedigree == 5001)
plot(df.plot$DAT, df.plot$NDVI_mean, ylim = c(0.0, 1.0))

# Vector of VI names for loop
VI.Names <- sort(unique(colnames(df)[19:ncol(df)]))

# For the purposes of this document, we'll only perform one iteration of the loop.
# Comment out the following line if you'd like to run the entire loop
VI.Names <- 'NGRDI_median'

Loop to run ANOVA models for each VI

# Initialize empty lists to store results
G_list <- list()
VC_list <- list()

for (vi.name in VI.Names) {
    
    # Subset data for the current VI
    df.subset <- df %>% select(all_of(c('Range', 'Row', 'Replicate', 'Pedigree', 'DAT', vi.name)))
    
    # Convert variables to appropriate data types
    df.subset$Range <- as.factor(df.subset$Range)
    df.subset$Row <- as.factor(df.subset$Row)
    df.subset$Replicate <- as.factor(df.subset$Replicate)
    df.subset$Pedigree <- as.factor(df.subset$Pedigree)
    df.subset$DAT <- as.factor(df.subset$DAT)
      
    # Progress report
    print(paste('ANOVA Model:', vi.name))
      
    # Fit the linear mixed-effects model after setting "Inf" and NaN as NA
    response <- df.subset[,vi.name]
    response[is.na(response) | response == 'Inf' | is.nan(response) | response == '-Inf'] <- NA
      
    anova <- lmer(response ~
                    (1 | Pedigree) +
                    (1 | DAT) +
                    (1 | Pedigree:DAT) +
                    (1 | Range:DAT) +
                    (1 | Row:DAT) +
                    (1 | Replicate:DAT),
                  data = df.subset)
      
    # Calculate RMSE
    rmse_value <- sqrt(mean(residuals(anova)^2))
      
    # Calculate R-squared
    R <- MuMIn::r.squaredGLMM(anova)[, "R2c"]  # Conditional R-squared
      
    # Extract variance components
    VC <- as.data.frame(VarCorr(anova))
    VC$Percent <- round(VC$vcov / sum(VC$vcov) * 100, 2)
      
    # Calculate heritability
    nRep <- length(unique(df.subset$Replicate))
    nFlight <- length(unique(df.subset$DAT))
    Vg <- VC[VC$grp == "Pedigree", "vcov"]
    Vgf <- VC[VC$grp == "Pedigree:DAT", "vcov"]
    Ve <- VC[VC$grp == "Residual", "vcov"]
    #heritability <- Vg / (Vg + (Vgf / nFlight) + (Ve / (nRep * nFlight)))
    #heritability <- round(heritability, 3)
    heritability <- Vg / (Vg + (Vgf) + (Ve / (nRep)))
    heritability <- round(heritability, 3)
      
    # Add additional information to VC data frame
    VC$Rmse <- rmse_value
    VC$Heritability <- heritability
    VC$R_squared <- R
    VC$Vegetation.Index <- vi.name
    print(VC)
      
    # Store VC in the list
    VC_list[[vi.name]] <- VC
      
    # Extract random effects for 'Pedigree'
    G <- coef(anova)$Pedigree
    G$Pedigree <- rownames(G)
      
    # Store G in the list
    G_list[[vi.name]] <- G
      
}
## [1] "ANOVA Model: NGRDI_median"
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00357318 (tol = 0.002, component 1)
##             grp        var1 var2         vcov       sdcor Percent       Rmse
## 1       Row:DAT (Intercept) <NA> 4.839831e-04 0.021999617    9.53 0.02931909
## 2 Replicate:DAT (Intercept) <NA> 1.935293e-05 0.004399197    0.38 0.02931909
## 3  Pedigree:DAT (Intercept) <NA> 6.108809e-05 0.007815887    1.20 0.02931909
## 4     Range:DAT (Intercept) <NA> 4.429150e-05 0.006655186    0.87 0.02931909
## 5           DAT (Intercept) <NA> 3.290852e-03 0.057365946   64.81 0.02931909
## 6      Pedigree (Intercept) <NA> 1.785007e-04 0.013360414    3.52 0.02931909
## 7      Residual        <NA> <NA> 9.995879e-04 0.031616261   19.69 0.02931909
##   Heritability R_squared Vegetation.Index
## 1        0.654 0.8031399     NGRDI_median
## 2        0.654 0.8031399     NGRDI_median
## 3        0.654 0.8031399     NGRDI_median
## 4        0.654 0.8031399     NGRDI_median
## 5        0.654 0.8031399     NGRDI_median
## 6        0.654 0.8031399     NGRDI_median
## 7        0.654 0.8031399     NGRDI_median

Save the ANOVA results

# Combine the lists into data frames
G_df <- rbindlist(G_list, fill = T) %>% as.data.frame()
VC_df <- rbindlist(VC_list, fill = T) %>% as.data.frame()

# View the first few rows of the results
head(G_df)
##   (Intercept) Pedigree
## 1   0.2495714     5001
## 2   0.2552577     5002
## 3   0.2531053     5003
## 4   0.2560163     5004
## 5   0.2573020     5005
## 6   0.2550449     5006
head(VC_df)
##             grp        var1 var2         vcov       sdcor Percent       Rmse
## 1       Row:DAT (Intercept) <NA> 4.839831e-04 0.021999617    9.53 0.02931909
## 2 Replicate:DAT (Intercept) <NA> 1.935293e-05 0.004399197    0.38 0.02931909
## 3  Pedigree:DAT (Intercept) <NA> 6.108809e-05 0.007815887    1.20 0.02931909
## 4     Range:DAT (Intercept) <NA> 4.429150e-05 0.006655186    0.87 0.02931909
## 5           DAT (Intercept) <NA> 3.290852e-03 0.057365946   64.81 0.02931909
## 6      Pedigree (Intercept) <NA> 1.785007e-04 0.013360414    3.52 0.02931909
##   Heritability R_squared Vegetation.Index
## 1        0.654 0.8031399     NGRDI_median
## 2        0.654 0.8031399     NGRDI_median
## 3        0.654 0.8031399     NGRDI_median
## 4        0.654 0.8031399     NGRDI_median
## 5        0.654 0.8031399     NGRDI_median
## 6        0.654 0.8031399     NGRDI_median
# Data wrangling before exporting
VC_df_cols <- colnames(VC_df)
VC_df_cols <- VC_df_cols[!VC_df_cols %in% c('var1', 'var2')]
VC_df <- VC_df[,VC_df_cols]

colnames(G_df)[1] <- 'VI.BLUP'
VI_vec <- rep(VI.Names, each = length(unique(df.subset$Pedigree)))
length(VI_vec) == nrow(G_df)
## [1] TRUE
G_df$Vegetation.Index <- VI_vec
G_df$Vegetation.Index.Type <- lapply(strsplit(as.character(G_df$Vegetation.Index), '_'), '[', 2) %>% unlist()

# Save the data frames
#fwrite(VC_df, paste0(data_path, 'VarComp_Across_All_DATs_Standard_VIs_Revised_Herit.csv'))
#fwrite(G_df, paste0(data_path, 'Quantile_BLUPs_Within_DAT_Standard_VIs_Revised_Herit.csv'))