library(dplyr)
library(tidyr)
library(ggplot2)
library(ggbeeswarm)
library(ggpubr)
library(rstatix)
library(lubridate)
library(readr)
library(stringr)
# Read the data
pheno <- read_csv("~/Desktop/PSU2025_pheno.csv") %>%
janitor::clean_names()

metadata <- read_csv("~/Desktop/PSU2025_metadata.csv")%>%
janitor::clean_names()

# Planting date 
planting_date <- mdy("5/20/2025")

# Process the data
field_data <- pheno %>%
  separate(row_plant, into = c("rowid", "nested_plantid"), sep = "-", remove = FALSE) %>%
  # Parse genotype information
  mutate(
    rowid = as.integer(rowid),
    donor = case_when(
      str_detect(note, "TMEX") ~ "TMEX",
      str_detect(note, "MI21") ~ "MI21", 
      str_detect(note, "B73") ~ "B73",
      TRUE ~ "Unknown"
    ),
    inv4m = case_when(
      str_detect(note, "INV4M") ~ "INV4M",
      str_detect(note, "CTRL") ~ "CTRL",
      str_detect(note, "B73") ~ "B73",
      TRUE ~ "Unknown"
    )
  ) %>%
  # Parse dates and calculate days
  mutate(
    anthesis_date = mdy(anthesis),
    silking_date = mdy(silking),
    DTA = as.numeric(anthesis_date - planting_date),
    DTS = as.numeric(silking_date - planting_date)
  ) %>%
  # Filter to focus on experimental genotypes
  filter(inv4m %in% c("INV4M", "CTRL")) %>%
  select(rowid, nested_plantid, donor, inv4m,  DTA, DTS, PH=actual_height) %>%
  inner_join(metadata %>% select(rowid,x =x_row,y= y_range), by = "rowid")

# Display data structure
glimpse(field_data)
## Rows: 640
## Columns: 9
## $ rowid          <dbl> 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1…
## $ nested_plantid <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "1",…
## $ donor          <chr> "TMEX", "TMEX", "TMEX", "TMEX", "TMEX", "TMEX", "TMEX",…
## $ inv4m          <chr> "INV4M", "INV4M", "INV4M", "INV4M", "INV4M", "INV4M", "…
## $ DTA            <dbl> 76, NA, NA, 86, 80, 77, 77, 77, NA, NA, 77, 79, 78, 85,…
## $ DTS            <dbl> 76, 77, 77, 70, 81, 77, 78, 81, NA, NA, 76, 79, 76, 90,…
## $ PH             <dbl> 222, NA, NA, 263, 271, 268, NA, 260, NA, NA, 243, 243, …
## $ x              <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8…
## $ y              <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,…
# Define custom mean ± 95% CI function
mean_ci_95 <- function(x) {
  x_clean <- na.omit(x)
  if(length(x_clean) < 2) return(c(y = NA, ymin = NA, ymax = NA))
  
  m <- mean(x_clean, na.rm = TRUE)
  se <- sd(x_clean, na.rm = TRUE) / sqrt(length(x_clean))
  ci <- qt(0.975, df = length(x_clean) - 1) * se
  return(c(y = m, ymin = m - ci, ymax = m + ci))
}

# Define generic plot function
plot_trait <- function(data, trait_name, y_label,
                       pal = c("INV4M" = "purple4", "CTRL" = "gold")) {
  
  # Filter out rows with missing trait values
  data_clean <- data  #%>% filter(!is.na(.data[[trait_name]]))
  
  if(nrow(data_clean) == 0) {
    warning(paste("No data available for", trait_name))
    return(ggplot() + theme_void())
  }
  
  # Compute t-tests and CI positioning
  stat <- data_clean %>%
    group_by(donor) %>%
    filter(n_distinct(inv4m) > 1, n() > 2) %>%  # Ensure we have both groups and enough data
    t_test(reformulate("inv4m", response = trait_name)) %>%
    adjust_pvalue(method = "bonferroni") %>%
    add_significance() %>%
    add_xy_position(x = "inv4m") 
  
  # Create base plot
  p <- ggplot(data_clean, aes_string(x = "inv4m", y = trait_name, color = "inv4m")) +
    # Add beeswarm points
    geom_beeswarm(
      size = 2.5, 
      alpha = 0.7,
      cex = 0.8
    ) +
        # Add mean ± 95% CI
    stat_summary(
      fun.data = mean_ci_95,  
      geom = "pointrange",
      position = position_dodge(width = 0.3),
      size = 0.8,
      color = "black"
    ) +
    facet_wrap(~donor, scales = "free_x") +
    scale_color_manual(values = pal) +
    labs(
      x = "Genotype",
      y = y_label,
      fill = "Genotype"
    ) +
    ggpubr::theme_classic2(base_size = 20) +
    theme(
      strip.background = element_blank(),
      strip.text = element_text(face = "bold"),
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "none"
    )
  

 
  
  # Add statistical annotations if we have stats
  if(nrow(stat) > 0) {
    p <- p + stat_pvalue_manual(
      stat,
      label = "p.adj",
      tip.length = 0.02,
      size = 4
    )
  }
  
  return(p)
}
# Generate summary statistics including plant height
summary_stats <- field_data %>%
  group_by(donor, inv4m) %>%
  summarise(
    n_plants = n(),
    dta_mean = mean(DTA, na.rm = TRUE),
    dta_sd = sd(DTA, na.rm = TRUE),
    dta_n = sum(!is.na(DTA)),
    dts_mean = mean(DTS, na.rm = TRUE),
    dts_sd = sd(DTS, na.rm = TRUE),
    dts_n = sum(!is.na(DTS)),
    ph_mean = mean(PH, na.rm = TRUE),
    ph_sd = sd(PH, na.rm = TRUE),
    ph_n = sum(!is.na(PH)),
    .groups = "drop"
  )

knitr::kable(summary_stats, digits = 2, caption = "Summary Statistics by Donor and Genotype")
Summary Statistics by Donor and Genotype
donor inv4m n_plants dta_mean dta_sd dta_n dts_mean dts_sd dts_n ph_mean ph_sd ph_n
MI21 CTRL 160 77.80 2.52 131 78.22 2.49 131 232.62 10.23 131
MI21 INV4M 160 78.33 2.68 112 78.45 2.76 119 240.24 11.68 113
TMEX CTRL 160 79.71 2.66 109 80.78 3.16 120 231.68 13.06 109
TMEX INV4M 160 78.73 2.60 99 79.79 3.56 110 255.95 14.34 100
# Generate plots including plant height
p_dta <- plot_trait(field_data, "DTA", "Days to Anthesis")
p_dts <- plot_trait(field_data, "DTS", "Days to Silking") 
p_ph <- plot_trait(field_data, "PH", "Plant Height (cm)")

# Combine plots in a 1x3 layout
combined_plot <- ggarrange(
  p_dta, 
  p_dts,
  p_ph,
  ncol = 3, 
  align = "hv",
  common.legend = TRUE,
  legend = "bottom"
)

print(combined_plot)

# Perform statistical tests including plant height
if(sum(!is.na(field_data$DTA)) > 0) {
  dta_stats <- field_data %>%
    filter(!is.na(DTA)) %>%
    group_by(donor) %>%
    t_test(DTA ~ inv4m) %>%
    adjust_pvalue(method = "bonferroni") %>%
    add_significance()
  
  cat("Days to Anthesis Statistical Tests:\n")
  print(dta_stats)
}
## Days to Anthesis Statistical Tests:
## # A tibble: 2 × 11
##   donor .y.   group1 group2    n1    n2 statistic    df       p  p.adj
##   <chr> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>  <dbl>
## 1 MI21  DTA   CTRL   INV4M    131   112     -1.58  230. 0.116   0.232 
## 2 TMEX  DTA   CTRL   INV4M    109    99      2.69  205. 0.00784 0.0157
## # ℹ 1 more variable: p.adj.signif <chr>
if(sum(!is.na(field_data$DTS)) > 0) {
  dts_stats <- field_data %>%
    filter(!is.na(DTS)) %>%
    group_by(donor) %>%
    t_test(DTS ~ inv4m) %>%
    adjust_pvalue(method = "bonferroni") %>%
    add_significance()
  
  cat("\nDays to Silking Statistical Tests:\n")
  print(dts_stats)
}
## 
## Days to Silking Statistical Tests:
## # A tibble: 2 × 11
##   donor .y.   group1 group2    n1    n2 statistic    df      p  p.adj
##   <chr> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>  <dbl>  <dbl>
## 1 MI21  DTS   CTRL   INV4M    131   119    -0.671  239. 0.503  1     
## 2 TMEX  DTS   CTRL   INV4M    120   110     2.23   219. 0.0268 0.0536
## # ℹ 1 more variable: p.adj.signif <chr>
if(sum(!is.na(field_data$PH)) > 0) {
  ph_stats <- field_data %>%
    filter(!is.na(PH)) %>%
    group_by(donor) %>%
    t_test(PH ~ inv4m) %>%
    adjust_pvalue(method = "bonferroni") %>%
    add_significance()
  
  cat("\nPlant Height Statistical Tests:\n")
  print(ph_stats)
}
## 
## Plant Height Statistical Tests:
## # A tibble: 2 × 11
##   donor .y.   group1 group2    n1    n2 statistic    df        p    p.adj
##   <chr> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl>
## 1 MI21  PH    CTRL   INV4M    131   113     -5.38  225. 1.85e- 7 3.7 e- 7
## 2 TMEX  PH    CTRL   INV4M    109   100    -12.8   201. 1.13e-27 2.26e-27
## # ℹ 1 more variable: p.adj.signif <chr>

Notes

Data Quality Notes

# Check for missing data patterns including plant height
missing_summary <- field_data %>%
  group_by(donor, inv4m) %>%
  summarise(
    total_plants = n(),
    missing_anthesis = sum(is.na(DTA)),
    missing_silking = sum(is.na(DTS)),
    missing_height = sum(is.na(PH)),
    .groups = "drop"
  )

knitr::kable(missing_summary, caption = "Missing Data Summary")
Missing Data Summary
donor inv4m total_plants missing_anthesis missing_silking missing_height
MI21 CTRL 160 29 29 29
MI21 INV4M 160 48 41 47
TMEX CTRL 160 51 40 51
TMEX INV4M 160 61 50 60