Flowering Time and Plant Height Analysis: Inv4m vs CTRL Effects

library(dplyr)
library(tidyr)
library(ggplot2)
library(ggbeeswarm)
library(ggpubr)
library(rstatix)
library(lubridate)
library(readr)
library(stringr)

# Read the data
pheno <- read_csv("~/Desktop/PSU2025_pheno.csv") %>%
janitor::clean_names()

metadata <- read_csv("~/Desktop/PSU2025_metadata.csv")%>%
janitor::clean_names()

# Planting date 
planting_date <- mdy("5/20/2025")

# Process the data
field_data <- pheno %>%
  separate(row_plant, into = c("rowid", "nested_plantid"), sep = "-", remove = FALSE) %>%
  # Parse genotype information
  mutate(
    rowid = as.integer(rowid),
    donor = case_when(
      str_detect(note, "TMEX") ~ "TMEX",
      str_detect(note, "MI21") ~ "MI21", 
      str_detect(note, "B73") ~ "B73",
      TRUE ~ "Unknown"
    ),
    inv4m = case_when(
      str_detect(note, "INV4M") ~ "INV4M",
      str_detect(note, "CTRL") ~ "CTRL",
      str_detect(note, "B73") ~ "B73",
      TRUE ~ "Unknown"
    )
  ) %>%
  # Parse dates and calculate days
  mutate(
    anthesis_date = mdy(anthesis),
    silking_date = mdy(silking),
    DTA = as.numeric(anthesis_date - planting_date),
    DTS = as.numeric(silking_date - planting_date)
  ) %>%
  # Filter to focus on experimental genotypes
  filter(inv4m %in% c("INV4M", "CTRL")) %>%
  select(rowid, nested_plantid, donor, inv4m,  DTA, DTS, PH=actual_height) %>%
  inner_join(metadata %>% select(rowid,x =x_row,y= y_range), by = "rowid")

# Display data structure
glimpse(field_data)

## Rows: 640
## Columns: 9
## $ rowid          <dbl> 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1…
## $ nested_plantid <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "1",…
## $ donor          <chr> "TMEX", "TMEX", "TMEX", "TMEX", "TMEX", "TMEX", "TMEX",…
## $ inv4m          <chr> "INV4M", "INV4M", "INV4M", "INV4M", "INV4M", "INV4M", "…
## $ DTA            <dbl> 76, NA, NA, 86, 80, 77, 77, 77, NA, NA, 77, 79, 78, 85,…
## $ DTS            <dbl> 76, 77, 77, 70, 81, 77, 78, 81, NA, NA, 76, 79, 76, 90,…
## $ PH             <dbl> 222, NA, NA, 263, 271, 268, NA, 260, NA, NA, 243, 243, …
## $ x              <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8…
## $ y              <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,…

# Define custom mean ± 95% CI function
mean_ci_95 <- function(x) {
  x_clean <- na.omit(x)
  if(length(x_clean) < 2) return(c(y = NA, ymin = NA, ymax = NA))
  
  m <- mean(x_clean, na.rm = TRUE)
  se <- sd(x_clean, na.rm = TRUE) / sqrt(length(x_clean))
  ci <- qt(0.975, df = length(x_clean) - 1) * se
  return(c(y = m, ymin = m - ci, ymax = m + ci))
}

# Define generic plot function
plot_trait <- function(data, trait_name, y_label,
                       pal = c("INV4M" = "purple4", "CTRL" = "gold")) {
  
  # Filter out rows with missing trait values
  data_clean <- data  #%>% filter(!is.na(.data[[trait_name]]))
  
  if(nrow(data_clean) == 0) {
    warning(paste("No data available for", trait_name))
    return(ggplot() + theme_void())
  }
  
  # Compute t-tests and CI positioning
  stat <- data_clean %>%
    group_by(donor) %>%
    filter(n_distinct(inv4m) > 1, n() > 2) %>%  # Ensure we have both groups and enough data
    t_test(reformulate("inv4m", response = trait_name)) %>%
    adjust_pvalue(method = "bonferroni") %>%
    add_significance() %>%
    add_xy_position(x = "inv4m") 
  
  # Create base plot
  p <- ggplot(data_clean, aes_string(x = "inv4m", y = trait_name, color = "inv4m")) +
    # Add beeswarm points
    geom_beeswarm(
      size = 2.5, 
      alpha = 0.7,
      cex = 0.8
    ) +
        # Add mean ± 95% CI
    stat_summary(
      fun.data = mean_ci_95,  
      geom = "pointrange",
      position = position_dodge(width = 0.3),
      size = 0.8,
      color = "black"
    ) +
    facet_wrap(~donor, scales = "free_x") +
    scale_color_manual(values = pal) +
    labs(
      x = "Genotype",
      y = y_label,
      fill = "Genotype"
    ) +
    ggpubr::theme_classic2(base_size = 20) +
    theme(
      strip.background = element_blank(),
      strip.text = element_text(face = "bold"),
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "none"
    )
  

 
  
  # Add statistical annotations if we have stats
  if(nrow(stat) > 0) {
    p <- p + stat_pvalue_manual(
      stat,
      label = "p.adj",
      tip.length = 0.02,
      size = 4
    )
  }
  
  return(p)
}

# Generate summary statistics including plant height
summary_stats <- field_data %>%
  group_by(donor, inv4m) %>%
  summarise(
    n_plants = n(),
    dta_mean = mean(DTA, na.rm = TRUE),
    dta_sd = sd(DTA, na.rm = TRUE),
    dta_n = sum(!is.na(DTA)),
    dts_mean = mean(DTS, na.rm = TRUE),
    dts_sd = sd(DTS, na.rm = TRUE),
    dts_n = sum(!is.na(DTS)),
    ph_mean = mean(PH, na.rm = TRUE),
    ph_sd = sd(PH, na.rm = TRUE),
    ph_n = sum(!is.na(PH)),
    .groups = "drop"
  )

knitr::kable(summary_stats, digits = 2, caption = "Summary Statistics by Donor and Genotype")

Summary Statistics by Donor and Genotype
donor	inv4m	n_plants	dta_mean	dta_sd	dta_n	dts_mean	dts_sd	dts_n	ph_mean	ph_sd	ph_n
MI21	CTRL	160	77.80	2.52	131	78.22	2.49	131	232.62	10.23	131
MI21	INV4M	160	78.33	2.68	112	78.45	2.76	119	240.24	11.68	113
TMEX	CTRL	160	79.71	2.66	109	80.78	3.16	120	231.68	13.06	109
TMEX	INV4M	160	78.73	2.60	99	79.79	3.56	110	255.95	14.34	100

# Generate plots including plant height
p_dta <- plot_trait(field_data, "DTA", "Days to Anthesis")
p_dts <- plot_trait(field_data, "DTS", "Days to Silking") 
p_ph <- plot_trait(field_data, "PH", "Plant Height (cm)")

# Combine plots in a 1x3 layout
combined_plot <- ggarrange(
  p_dta, 
  p_dts,
  p_ph,
  ncol = 3, 
  align = "hv",
  common.legend = TRUE,
  legend = "bottom"
)

print(combined_plot)

# Perform statistical tests including plant height
if(sum(!is.na(field_data$DTA)) > 0) {
  dta_stats <- field_data %>%
    filter(!is.na(DTA)) %>%
    group_by(donor) %>%
    t_test(DTA ~ inv4m) %>%
    adjust_pvalue(method = "bonferroni") %>%
    add_significance()
  
  cat("Days to Anthesis Statistical Tests:\n")
  print(dta_stats)
}

## Days to Anthesis Statistical Tests:
## # A tibble: 2 × 11
##   donor .y.   group1 group2    n1    n2 statistic    df       p  p.adj
##   <chr> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>  <dbl>
## 1 MI21  DTA   CTRL   INV4M    131   112     -1.58  230. 0.116   0.232 
## 2 TMEX  DTA   CTRL   INV4M    109    99      2.69  205. 0.00784 0.0157
## # ℹ 1 more variable: p.adj.signif <chr>

if(sum(!is.na(field_data$DTS)) > 0) {
  dts_stats <- field_data %>%
    filter(!is.na(DTS)) %>%
    group_by(donor) %>%
    t_test(DTS ~ inv4m) %>%
    adjust_pvalue(method = "bonferroni") %>%
    add_significance()
  
  cat("\nDays to Silking Statistical Tests:\n")
  print(dts_stats)
}

## 
## Days to Silking Statistical Tests:
## # A tibble: 2 × 11
##   donor .y.   group1 group2    n1    n2 statistic    df      p  p.adj
##   <chr> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>  <dbl>  <dbl>
## 1 MI21  DTS   CTRL   INV4M    131   119    -0.671  239. 0.503  1     
## 2 TMEX  DTS   CTRL   INV4M    120   110     2.23   219. 0.0268 0.0536
## # ℹ 1 more variable: p.adj.signif <chr>

if(sum(!is.na(field_data$PH)) > 0) {
  ph_stats <- field_data %>%
    filter(!is.na(PH)) %>%
    group_by(donor) %>%
    t_test(PH ~ inv4m) %>%
    adjust_pvalue(method = "bonferroni") %>%
    add_significance()
  
  cat("\nPlant Height Statistical Tests:\n")
  print(ph_stats)
}

## 
## Plant Height Statistical Tests:
## # A tibble: 2 × 11
##   donor .y.   group1 group2    n1    n2 statistic    df        p    p.adj
##   <chr> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl>
## 1 MI21  PH    CTRL   INV4M    131   113     -5.38  225. 1.85e- 7 3.7 e- 7
## 2 TMEX  PH    CTRL   INV4M    109   100    -12.8   201. 1.13e-27 2.26e-27
## # ℹ 1 more variable: p.adj.signif <chr>

Notes

Planting date assumed: 2025-05-20 (adjust the planting_date variable if different)
Color scheme: CTRL = gold, INV4M = purple4
Missing data: Rows with NA values are excluded from statistical analyses
Statistical tests: Two-sided t-tests with Bonferroni correction for multiple comparisons
Plot elements: Beeswarm points show individual data, boxplots show distribution, black points with error bars show mean ± 95% CI

Data Quality Notes

# Check for missing data patterns including plant height
missing_summary <- field_data %>%
  group_by(donor, inv4m) %>%
  summarise(
    total_plants = n(),
    missing_anthesis = sum(is.na(DTA)),
    missing_silking = sum(is.na(DTS)),
    missing_height = sum(is.na(PH)),
    .groups = "drop"
  )

knitr::kable(missing_summary, caption = "Missing Data Summary")

Missing Data Summary
donor	inv4m	total_plants	missing_anthesis	missing_silking	missing_height
MI21	CTRL	160	29	29	29
MI21	INV4M	160	48	41	47
TMEX	CTRL	160	51	40	51
TMEX	INV4M	160	61	50	60

Flowering Time and Plant Height Analysis: Inv4m vs CTRL Effects

2025-09-15

Notes

Data Quality Notes