Non-destructive, whole-plant phenotyping data analysis

#Load necessary libraries
library(readxl)
library(rmarkdown)
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(pysd2r)
library(varhandle)
library(ggbeeswarm)
library(broom)
library(multcompView)
library(multcomp)
library(ggpubr)
library(rcompanion)
library(emmeans)

#Set working directory
setwd("/Users/danny/whole_plant_phenotyping/")

#load data into R
data <- read_xlsx("master_file.xlsx")

#convert variables to factors, and order factors as desired
data$Genotype <- factor(data$Genotype, levels = c("RTx430", "BTx623", "LTA", "Grassl", "Tx7078"))
data$Days_of_measurement <- factor(data$Days_of_measurement)
data$Treatment <- factor(data$Treatment, levels = c("Control", "Mannitol"))
data$Sample_ID <- factor(data$Sample_ID)

#view data
paged_table(data)

#assign custom colors to each treatment
treatment_colors <- c("#56b48b", "#E69F00")

#Make linegraphs of average values per day and genotype, separated by treatment
biomass_lineplot <- ggplot(data, aes(x = Days_of_measurement, y = Cumulative_biomass_accumulation)) +
  #draw 95% confidence interval around average value for each day and treatment
  stat_summary(fun.data = mean_cl_normal, geom = "ribbon", linetype = 0, aes(group = Treatment), alpha = 0.25) +
  #draw line connecting average values per genotype and treatment by day
  stat_summary(fun = mean, aes(group = Treatment, color = Treatment), size = 0.9, geom = "line", linetype = "solid") +
  #draw lines for each individual sample and make them somewhat transparent for visual clarity
  stat_summary(fun = mean, aes(group = Sample_ID, color = Treatment), size = 0.45, alpha = 0.2, geom = "line", linetype = "solid") +
  #group by genotype
  facet_grid(.~ Genotype) +
  #add custom colors for each treatment
  scale_color_manual(values = treatment_colors) +
  scale_fill_manual(values = treatment_colors) +
  theme(panel.background = element_rect(fill = "transparent", color = NA), axis.line = element_line(colour = "transparent")) +
  theme(panel.grid.major = element_line(color = 'transparent')) +
  theme(panel.grid.minor = element_line(color = "transparent")) +
  theme(axis.line = element_line(color = "black", size = 0.65)) +
  theme(axis.ticks.length = unit(1.5, "mm")) +
  theme(axis.ticks = element_line(size = 0.75, color = "black")) +
  theme(axis.title = element_text(size = 12)) +
  theme(axis.text = element_text(colour = "black", size = 12)) +
  ylab("Average biomass accumulation (g)") +
  xlab("Time (days)") +
  theme(strip.background = element_rect(color = "black", size = 0.65)) +
  theme(strip.text = element_text(size = 12)) +
  theme(legend.position = "none")

biomass_lineplot

#Prepare data for adding statistical significance results to line graphs
melt_data <- data
melt_data$Initial_pH <- NULL
melt_data$Measured_pH <- NULL

#Convert data from "wide" to "long" format for streamlined statistical analysis
aggregate_data <- reshape2::melt(melt_data, id = c("Sample_ID", "Genotype", "Treatment", "Days_of_measurement"))
aggregate_data$variable <- factor(aggregate_data$variable)

#calculate p-values via 2-way ANOVA for comparing Control and Mannitol-treated samples for each day of measurement and for each metric.

# Make an empty list to store the results
result_list <- list()

# Loop over each variable
for (i in 1:length(levels(aggregate_data$variable))) {
  
  # Loop over each Genotype
  for (j in 1:length(levels(aggregate_data$Genotype))) {
    
    # Subset data for each variable-Genotype combination
    subset_data <- subset(aggregate_data, variable == levels(aggregate_data$variable)[i] & Genotype == levels(aggregate_data$Genotype)[j])

    # 2-way ANOVA
    aov_result <- aov(value ~ Treatment*Days_of_measurement, data = subset_data)
    
    # Post-hoc analysis of 2-way ANOVA
    tukey_result <- TukeyHSD(aov_result)
    
    # Extract p-values
    p_value <- tukey_result$`Treatment:Days_of_measurement`[, 4]
    
    # Store the results in the list we already created
    result_list[[length(result_list) + 1]] <- data.frame(
      Genotype = levels(aggregate_data$Genotype)[j],
      Metric = levels(aggregate_data$variable)[i],
      p_value
    )
  }
}

# Combine the list of results into a data frame
result_df <- do.call(rbind, result_list)

#clean and organize the data frame for downstream analysis
result_df$contrasts <- rownames(result_df)
rownames(result_df) <- NULL
result_df <- separate(result_df, contrasts, into = c("var1", "var2"), sep = "-", remove = FALSE)
result_df$contrasts <- NULL
result_df <- separate(result_df, var1, into = c("Treatment_1", "Day_1"), sep = ":", remove = FALSE)
result_df <- separate(result_df, var2, into = c("Treatment_2", "Day_2"), sep = ":", remove = FALSE)
result_df$var1 <- NULL
result_df$var2 <- NULL
result_df$Day_2 <- substr(result_df$Day_2, 1, 1)

#Add columns for comparing Treatment and Day status
result_df$treatment_comparison <- ifelse(result_df$Treatment_1 == result_df$Treatment_2, "TRUE", "FALSE")
result_df$day_comparison <- ifelse(result_df$Day_1 == result_df$Day_2, "TRUE", "FALSE")

#Make a new data frame with only the contrasts of interest
cleaned_df <- subset(result_df, treatment_comparison %in% c("FALSE") & day_comparison %in% c("TRUE")) 
cleaned_df$Treatment_1 <- NULL
cleaned_df$Treatment_2 <- NULL
cleaned_df$treatment_comparison <- NULL
cleaned_df$day_comparison <- NULL
cleaned_df$Day_2 <- NULL

cleaned_df <- cleaned_df %>%
  rename(Day_of_measurement = Day_1)

cleaned_df <- cleaned_df %>%
  dplyr::select(Metric, Genotype, Day_of_measurement, p_value)

#add a column with asterisks according to the degree of statistical significance
cleaned_df <- cleaned_df %>%
  mutate(sig_level = case_when(
    p_value < 0.05 & p_value >= 0.01 ~ "*",
    p_value < 0.01 & p_value >= 0.001 ~ "**",
    p_value < 0.001 ~ "***",
    TRUE ~ ""
  ))

rownames(cleaned_df) <- NULL

#save results in a data frame with a clearer name
anova_results_by_metric_genotype_day <- cleaned_df

paged_table(anova_results_by_metric_genotype_day)

#add anova results to line graphs
biomass_lineplot <- biomass_lineplot + geom_text(data = subset(anova_results_by_metric_genotype_day, Metric %in% c("Cumulative_biomass_accumulation")), aes(x = Day_of_measurement, y = 6.5, label = sig_level), size = 4.5)

biomass_lineplot

#Now let's make plots for visualizing the linear regression of each metric-genotype-condition combination over the entire time course with the corresponding R-squared values

#convert Days_of_measurement to be a numerical value
data$Days_of_measurement <- as.double(data$Days_of_measurement)

biomass_rate <- ggplot(data, aes(x = Days_of_measurement, y = Cumulative_biomass_accumulation, color = Treatment)) +
  #regression line for individual samples
  geom_line(stat="smooth", method = "lm", formula = y ~ x, aes(group = Sample_ID, color = Treatment), se = FALSE, linewidth = 0.15, alpha = 0.5) +
  #average regression by treatment
  geom_smooth(formula = y ~ x, method = lm, aes(fill = Treatment), linetype = 1, size = 0.6, alpha = 0.8) +
  geom_jitter(cex = 0.25, width = 0.12, shape = 21, stroke = 0.4) +
  scale_color_manual(values = treatment_colors) +
  scale_fill_manual(values = treatment_colors) +
  facet_grid(.~ Genotype) +
  theme(plot.background = element_rect(fill = "transparent", color = "transparent")) +
  theme(panel.background = element_rect(fill = "transparent", color = NA), axis.line = element_line(colour = "black")) +
  theme(panel.grid.major = element_line(color = 'transparent')) +
  theme(panel.grid.minor = element_line(color = "transparent")) +
  theme(axis.line = element_line(color = "black", size = 0.65)) +
  theme(axis.ticks.length = unit(1.5, "mm")) +
  theme(axis.ticks = element_line(size = 0.75, color = "black")) +
  theme(axis.title = element_text(size = 12)) +
  theme(axis.text = element_text(colour = "black", size = 12)) +
  ylab("Biomass accumulation (g)") +
  xlab("Days of measurement") +
  theme(strip.background = element_rect(color = "black", size = 0.65)) +
  theme(strip.text = element_text(size = 12)) +
  theme(legend.position = "none") +
  #add R-squared values for each average regression line
  stat_cor(aes(label=paste(gsub("R", "r",..rr.label..),sep="*','~")),label.x=c(2,2),label.y = c(7.6,6.85), geom = "text", size = 5, r.accuracy = 0.01)

biomass_rate

#Now let's make plots to visualize and compare the 8-day rates (slopes of linear regression)

#First, we need to make a new data frame without Delta_pH values (from which we are not calculating rates)
rates_data_input <- aggregate_data

rates_data_input$variable <- unfactor(rates_data_input$variable)

rates_data_input <- subset(rates_data_input, variable != "Delta_pH")

rates_data_input$variable <- factor(rates_data_input$variable)

#convert Days_of_measurement to be a numeric value rather than a factor
rates_data_input$Days_of_measurement <- unfactor(rates_data_input$Days_of_measurement)

#
rate_result_list <- list()

for (i in 1:length(levels(rates_data_input$variable))) {
  current_level <- levels(rates_data_input$variable)[i]
  
  model_results <- rates_data_input %>%
    filter(variable == current_level) %>%
    group_by(Sample_ID) %>%
    do({
      model = lm(value ~ Days_of_measurement, data = .)
      tidied = tidy(model)
      data.frame(Sample_ID = unique(.$Sample_ID), tidied)
    })
  
  if (nrow(model_results) == 0) {
    warning("No non-missing data for level:", current_level)
  }
  
  rate_result_list[[i]] <- model_results %>%
    mutate(Metric = current_level)
}

# Combine the list of results into a data frame
rate_result_df <- do.call(rbind, rate_result_list)

#Keep just the rate values
rate_result_df <- subset(rate_result_df, term %in% "Days_of_measurement")

rate_result_df$term <- NULL

rate_result_df <- rate_result_df %>%
  rename(Rate = estimate)

rate_result_df <- rate_result_df %>%
  mutate(across(c("Rate", "std.error", "statistic", "p.value"), ~ round(., digits = 4)))

paged_table(rate_result_df)

#for streamlining visualization of rate plots, let's make a new data frame containing average rate values and 95% confidence intervals for each genotype-treatment-metric combination
rate_result_df$Metric <- factor(rate_result_df$Metric)

#add Genotype and Treatment information back into rate_result_df and convert both to be factors
rate_result_df$Genotype <- sub("^[^_]+_([^_]+).*", "\\1", rate_result_df$Sample_ID)
rate_result_df$Genotype <- factor(rate_result_df$Genotype, levels = c("RTx430", "BTx623", "LTA", "Grassl", "Tx7078"))

rate_result_df$Treatment <- sub("^[^_]+_[^_]+_([^_]+).*", "\\1", rate_result_df$Sample_ID)
rate_result_df$Treatment <- factor(rate_result_df$Treatment, levels = c("Control", "Mannitol"))

means <- rate_result_df %>% group_by(Genotype, Treatment, Metric) %>%
  summarise(.groups="keep", mean_rate = mean(Rate, na.rm = TRUE), CI_rate = (1.96*sd(Rate, na.rm = TRUE)/sqrt(n())))

#Now we can plot average, 8-day rates by metric, genotype, and condition
biomass_rate_plot <- ggplot(subset(rate_result_df, Metric %in% c("Cumulative_biomass_accumulation")), aes(x = Genotype, y = Rate)) +
  facet_grid(.~ Treatment) +
  geom_beeswarm(color = "darkgrey", cex = 2, size = 1) +
  geom_errorbar(inherit.aes = FALSE, data = subset(means, Metric %in% c("Cumulative_biomass_accumulation")), aes(x = Genotype, ymin = mean_rate - CI_rate, ymax = mean_rate + CI_rate, color = Genotype), 
                width = 0, size = 1) +
  stat_summary(fun = mean, size = 5, geom = "point", aes(group = Genotype, color = Genotype)) +
  theme(legend.position = "none") +
  theme(plot.background = element_rect(fill = "transparent", color = "transparent")) +
  theme(panel.background = element_rect(fill = "transparent", color = NA), axis.line = element_line(colour = "transparent")) +
  theme(panel.grid.major = element_line(color = 'transparent')) +
  theme(panel.grid.minor = element_line(color = "transparent")) +
  theme(axis.line = element_line(color = "black", size = 0.65)) +
  theme(axis.ticks.length = unit(1.5, "mm")) +
  theme(axis.ticks = element_line(size = 0.75, color = "black")) +
  theme(axis.title = element_text(size = 12)) +
  theme(axis.text = element_text(colour = "black", size = 12)) +
  ylab("Biomass accumulation (g/day)") +
  theme(axis.title.x = element_blank()) +
  theme(strip.background = element_rect(color = "black", size = 0.65)) +
  theme(strip.text = element_text(size = 12)) +
  theme(axis.text.x = element_text(angle = 90))

biomass_rate_plot

#Now let's statistically compare the 8-day rates for each genotype (separately for each Treatment)

#make a un-grouped data frame to serve as our input
rates_df <- data.frame(rate_result_df)

lin_regs_anova_result <- list()

# Loop over each variable
for (i in 1:length(levels(rate_result_df$Metric))) {
  
  # Loop over each Treatment
  for (j in 1:length(levels(rate_result_df$Treatment))) {
    
    # Subset data for each Metric-Treatment combination
    subset_data <- subset(rate_result_df, Metric == levels(rate_result_df$Metric)[i] & Treatment == levels(rate_result_df$Treatment)[j])

    # 2-way ANOVA
    model <- aov(Rate ~ Genotype, data = subset_data)
  
    # Post-hoc pairwise comparisons using emmeans
    model_means <- emmeans(model, specs = "Genotype")
  
    # Extract letter groups using cld function
    letters <- cld(model_means, Letters = as.character(LETTERS))
    
    #temp_df <- data.frame(letters)
    
    # Store the results in the list
    lin_regs_anova_result[[length(lin_regs_anova_result) + 1]] <- data.frame(
      Treatment = levels(rate_result_df$Treatment)[j],
      Metric = levels(rate_result_df$Metric)[i],
      Genotype = levels(rate_result_df$Genotype),
      Letters = as.character(letters$.group)
    )
  }
}

rate_anova_results <- do.call(rbind, lin_regs_anova_result)

#remove leading spaces from Tukey letters column
rate_anova_results$Letters <- trimws(rate_anova_results$Letters)

#view Tukey-HSD post-hoc letter results
paged_table(rate_anova_results)

#now let's add the Tukey letters to our rate plots
rate_anova_results$Treatment <- factor(rate_anova_results$Treatment, levels = c("Control", "Mannitol"))
rate_anova_results$Genotype <- factor(rate_anova_results$Genotype, levels = c("RTx430", "BTx623", "LTA", "Grassl", "Tx7078"))
rate_anova_results$Metric <- factor(rate_anova_results$Metric)

#We don't want to confuse the results of our 1-way ANOVA across for each treatment (i.e. Control) with the 1-way ANOVA results for the Mannitol samples. So let's convert case of letters for Mannitol samples to be lowercase
rate_anova_results <- rate_anova_results %>%
  mutate(Letters = ifelse(Treatment == "Mannitol", tolower(Letters), Letters))

biomass_rate_plot <- biomass_rate_plot + geom_text(data = subset(rate_anova_results, Metric %in% c("Cumulative_biomass_accumulation")), aes(x = Genotype, y = 1, label = Letters), size = 4.5)

biomass_rate_plot

Non-destructive, whole-plant phenotyping data analysis

Danny Ginzburg

2024-01-18