library(dplyr)
library(tidyr)
library(ggplot2)
library(ggbeeswarm)
library(ggpubr)
library(rstatix)
library(lubridate)
library(readr)
library(stringr)
# Read the data
pheno <- read_csv("~/Desktop/PSU2025_pheno.csv") %>%
janitor::clean_names()
metadata <- read_csv("~/Desktop/PSU2025_metadata.csv")%>%
janitor::clean_names()
# Planting date
planting_date <- mdy("5/20/2025")
# Process the data
field_data <- pheno %>%
separate(row_plant, into = c("rowid", "nested_plantid"), sep = "-", remove = FALSE) %>%
# Parse genotype information
mutate(
rowid = as.integer(rowid),
donor = case_when(
str_detect(note, "TMEX") ~ "TMEX",
str_detect(note, "MI21") ~ "MI21",
str_detect(note, "B73") ~ "B73",
TRUE ~ "Unknown"
),
inv4m = case_when(
str_detect(note, "INV4M") ~ "INV4M",
str_detect(note, "CTRL") ~ "CTRL",
str_detect(note, "B73") ~ "B73",
TRUE ~ "Unknown"
)
) %>%
# Parse dates and calculate days
mutate(
anthesis_date = mdy(anthesis),
silking_date = mdy(silking),
DTA = as.numeric(anthesis_date - planting_date),
DTS = as.numeric(silking_date - planting_date)
) %>%
# Filter to focus on experimental genotypes
filter(inv4m %in% c("INV4M", "CTRL")) %>%
select(rowid, nested_plantid, donor, inv4m, DTA, DTS, PH=actual_height) %>%
inner_join(metadata %>% select(rowid,x =x_row,y= y_range), by = "rowid")
# Display data structure
glimpse(field_data)
## Rows: 640
## Columns: 9
## $ rowid <dbl> 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1…
## $ nested_plantid <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "1",…
## $ donor <chr> "TMEX", "TMEX", "TMEX", "TMEX", "TMEX", "TMEX", "TMEX",…
## $ inv4m <chr> "INV4M", "INV4M", "INV4M", "INV4M", "INV4M", "INV4M", "…
## $ DTA <dbl> 76, NA, NA, 86, 80, 77, 77, 77, NA, NA, 77, 79, 78, 85,…
## $ DTS <dbl> 76, 77, 77, 70, 81, 77, 78, 81, NA, NA, 76, 79, 76, 90,…
## $ PH <dbl> 222, NA, NA, 263, 271, 268, NA, 260, NA, NA, 243, 243, …
## $ x <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8…
## $ y <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,…
# Define custom mean ± 95% CI function
mean_ci_95 <- function(x) {
x_clean <- na.omit(x)
if(length(x_clean) < 2) return(c(y = NA, ymin = NA, ymax = NA))
m <- mean(x_clean, na.rm = TRUE)
se <- sd(x_clean, na.rm = TRUE) / sqrt(length(x_clean))
ci <- qt(0.975, df = length(x_clean) - 1) * se
return(c(y = m, ymin = m - ci, ymax = m + ci))
}
# Define generic plot function
plot_trait <- function(data, trait_name, y_label,
pal = c("INV4M" = "purple4", "CTRL" = "gold")) {
# Filter out rows with missing trait values
data_clean <- data #%>% filter(!is.na(.data[[trait_name]]))
if(nrow(data_clean) == 0) {
warning(paste("No data available for", trait_name))
return(ggplot() + theme_void())
}
# Compute t-tests and CI positioning
stat <- data_clean %>%
group_by(donor) %>%
filter(n_distinct(inv4m) > 1, n() > 2) %>% # Ensure we have both groups and enough data
t_test(reformulate("inv4m", response = trait_name)) %>%
adjust_pvalue(method = "bonferroni") %>%
add_significance() %>%
add_xy_position(x = "inv4m")
# Create base plot
p <- ggplot(data_clean, aes_string(x = "inv4m", y = trait_name, color = "inv4m")) +
# Add beeswarm points
geom_beeswarm(
size = 2.5,
alpha = 0.7,
cex = 0.8
) +
# Add mean ± 95% CI
stat_summary(
fun.data = mean_ci_95,
geom = "pointrange",
position = position_dodge(width = 0.3),
size = 0.8,
color = "black"
) +
facet_wrap(~donor, scales = "free_x") +
scale_color_manual(values = pal) +
labs(
x = "Genotype",
y = y_label,
fill = "Genotype"
) +
ggpubr::theme_classic2(base_size = 20) +
theme(
strip.background = element_blank(),
strip.text = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
# Add statistical annotations if we have stats
if(nrow(stat) > 0) {
p <- p + stat_pvalue_manual(
stat,
label = "p.adj",
tip.length = 0.02,
size = 4
)
}
return(p)
}
# Generate summary statistics including plant height
summary_stats <- field_data %>%
group_by(donor, inv4m) %>%
summarise(
n_plants = n(),
dta_mean = mean(DTA, na.rm = TRUE),
dta_sd = sd(DTA, na.rm = TRUE),
dta_n = sum(!is.na(DTA)),
dts_mean = mean(DTS, na.rm = TRUE),
dts_sd = sd(DTS, na.rm = TRUE),
dts_n = sum(!is.na(DTS)),
ph_mean = mean(PH, na.rm = TRUE),
ph_sd = sd(PH, na.rm = TRUE),
ph_n = sum(!is.na(PH)),
.groups = "drop"
)
knitr::kable(summary_stats, digits = 2, caption = "Summary Statistics by Donor and Genotype")
Summary Statistics by Donor and Genotype
MI21 |
CTRL |
160 |
77.80 |
2.52 |
131 |
78.22 |
2.49 |
131 |
232.62 |
10.23 |
131 |
MI21 |
INV4M |
160 |
78.33 |
2.68 |
112 |
78.45 |
2.76 |
119 |
240.24 |
11.68 |
113 |
TMEX |
CTRL |
160 |
79.71 |
2.66 |
109 |
80.78 |
3.16 |
120 |
231.68 |
13.06 |
109 |
TMEX |
INV4M |
160 |
78.73 |
2.60 |
99 |
79.79 |
3.56 |
110 |
255.95 |
14.34 |
100 |
# Generate plots including plant height
p_dta <- plot_trait(field_data, "DTA", "Days to Anthesis")
p_dts <- plot_trait(field_data, "DTS", "Days to Silking")
p_ph <- plot_trait(field_data, "PH", "Plant Height (cm)")
# Combine plots in a 1x3 layout
combined_plot <- ggarrange(
p_dta,
p_dts,
p_ph,
ncol = 3,
align = "hv",
common.legend = TRUE,
legend = "bottom"
)
print(combined_plot)

# Perform statistical tests including plant height
if(sum(!is.na(field_data$DTA)) > 0) {
dta_stats <- field_data %>%
filter(!is.na(DTA)) %>%
group_by(donor) %>%
t_test(DTA ~ inv4m) %>%
adjust_pvalue(method = "bonferroni") %>%
add_significance()
cat("Days to Anthesis Statistical Tests:\n")
print(dta_stats)
}
## Days to Anthesis Statistical Tests:
## # A tibble: 2 × 11
## donor .y. group1 group2 n1 n2 statistic df p p.adj
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 MI21 DTA CTRL INV4M 131 112 -1.58 230. 0.116 0.232
## 2 TMEX DTA CTRL INV4M 109 99 2.69 205. 0.00784 0.0157
## # ℹ 1 more variable: p.adj.signif <chr>
if(sum(!is.na(field_data$DTS)) > 0) {
dts_stats <- field_data %>%
filter(!is.na(DTS)) %>%
group_by(donor) %>%
t_test(DTS ~ inv4m) %>%
adjust_pvalue(method = "bonferroni") %>%
add_significance()
cat("\nDays to Silking Statistical Tests:\n")
print(dts_stats)
}
##
## Days to Silking Statistical Tests:
## # A tibble: 2 × 11
## donor .y. group1 group2 n1 n2 statistic df p p.adj
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 MI21 DTS CTRL INV4M 131 119 -0.671 239. 0.503 1
## 2 TMEX DTS CTRL INV4M 120 110 2.23 219. 0.0268 0.0536
## # ℹ 1 more variable: p.adj.signif <chr>
if(sum(!is.na(field_data$PH)) > 0) {
ph_stats <- field_data %>%
filter(!is.na(PH)) %>%
group_by(donor) %>%
t_test(PH ~ inv4m) %>%
adjust_pvalue(method = "bonferroni") %>%
add_significance()
cat("\nPlant Height Statistical Tests:\n")
print(ph_stats)
}
##
## Plant Height Statistical Tests:
## # A tibble: 2 × 11
## donor .y. group1 group2 n1 n2 statistic df p p.adj
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 MI21 PH CTRL INV4M 131 113 -5.38 225. 1.85e- 7 3.7 e- 7
## 2 TMEX PH CTRL INV4M 109 100 -12.8 201. 1.13e-27 2.26e-27
## # ℹ 1 more variable: p.adj.signif <chr>
Notes
- Planting date assumed: 2025-05-20 (adjust the
planting_date
variable if different)
- Color scheme: CTRL = gold, INV4M = purple4
- Missing data: Rows with NA values are excluded from
statistical analyses
- Statistical tests: Two-sided t-tests with
Bonferroni correction for multiple comparisons
- Plot elements: Beeswarm points show individual
data, boxplots show distribution, black points with error bars show mean
± 95% CI
Data Quality Notes
# Check for missing data patterns including plant height
missing_summary <- field_data %>%
group_by(donor, inv4m) %>%
summarise(
total_plants = n(),
missing_anthesis = sum(is.na(DTA)),
missing_silking = sum(is.na(DTS)),
missing_height = sum(is.na(PH)),
.groups = "drop"
)
knitr::kable(missing_summary, caption = "Missing Data Summary")
Missing Data Summary
MI21 |
CTRL |
160 |
29 |
29 |
29 |
MI21 |
INV4M |
160 |
48 |
41 |
47 |
TMEX |
CTRL |
160 |
51 |
40 |
51 |
TMEX |
INV4M |
160 |
61 |
50 |
60 |