#Load necessary libraries
library(readxl)
library(rmarkdown)
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(pysd2r)
library(varhandle)
library(ggbeeswarm)
library(broom)
library(multcompView)
library(multcomp)
library(ggpubr)
library(rcompanion)
library(emmeans)
#Set working directory
setwd("/Users/danny/whole_plant_phenotyping/")
#load data into R
data <- read_xlsx("master_file.xlsx")
#convert variables to factors, and order factors as desired
data$Genotype <- factor(data$Genotype, levels = c("RTx430", "BTx623", "LTA", "Grassl", "Tx7078"))
data$Days_of_measurement <- factor(data$Days_of_measurement)
data$Treatment <- factor(data$Treatment, levels = c("Control", "Mannitol"))
data$Sample_ID <- factor(data$Sample_ID)
#view data
paged_table(data)
#assign custom colors to each treatment
treatment_colors <- c("#56b48b", "#E69F00")
#Make linegraphs of average values per day and genotype, separated by treatment
biomass_lineplot <- ggplot(data, aes(x = Days_of_measurement, y = Cumulative_biomass_accumulation)) +
#draw 95% confidence interval around average value for each day and treatment
stat_summary(fun.data = mean_cl_normal, geom = "ribbon", linetype = 0, aes(group = Treatment), alpha = 0.25) +
#draw line connecting average values per genotype and treatment by day
stat_summary(fun = mean, aes(group = Treatment, color = Treatment), size = 0.9, geom = "line", linetype = "solid") +
#draw lines for each individual sample and make them somewhat transparent for visual clarity
stat_summary(fun = mean, aes(group = Sample_ID, color = Treatment), size = 0.45, alpha = 0.2, geom = "line", linetype = "solid") +
#group by genotype
facet_grid(.~ Genotype) +
#add custom colors for each treatment
scale_color_manual(values = treatment_colors) +
scale_fill_manual(values = treatment_colors) +
theme(panel.background = element_rect(fill = "transparent", color = NA), axis.line = element_line(colour = "transparent")) +
theme(panel.grid.major = element_line(color = 'transparent')) +
theme(panel.grid.minor = element_line(color = "transparent")) +
theme(axis.line = element_line(color = "black", size = 0.65)) +
theme(axis.ticks.length = unit(1.5, "mm")) +
theme(axis.ticks = element_line(size = 0.75, color = "black")) +
theme(axis.title = element_text(size = 12)) +
theme(axis.text = element_text(colour = "black", size = 12)) +
ylab("Average biomass accumulation (g)") +
xlab("Time (days)") +
theme(strip.background = element_rect(color = "black", size = 0.65)) +
theme(strip.text = element_text(size = 12)) +
theme(legend.position = "none")
biomass_lineplot

#Prepare data for adding statistical significance results to line graphs
melt_data <- data
melt_data$Initial_pH <- NULL
melt_data$Measured_pH <- NULL
#Convert data from "wide" to "long" format for streamlined statistical analysis
aggregate_data <- reshape2::melt(melt_data, id = c("Sample_ID", "Genotype", "Treatment", "Days_of_measurement"))
aggregate_data$variable <- factor(aggregate_data$variable)
#calculate p-values via 2-way ANOVA for comparing Control and Mannitol-treated samples for each day of measurement and for each metric.
# Make an empty list to store the results
result_list <- list()
# Loop over each variable
for (i in 1:length(levels(aggregate_data$variable))) {
# Loop over each Genotype
for (j in 1:length(levels(aggregate_data$Genotype))) {
# Subset data for each variable-Genotype combination
subset_data <- subset(aggregate_data, variable == levels(aggregate_data$variable)[i] & Genotype == levels(aggregate_data$Genotype)[j])
# 2-way ANOVA
aov_result <- aov(value ~ Treatment*Days_of_measurement, data = subset_data)
# Post-hoc analysis of 2-way ANOVA
tukey_result <- TukeyHSD(aov_result)
# Extract p-values
p_value <- tukey_result$`Treatment:Days_of_measurement`[, 4]
# Store the results in the list we already created
result_list[[length(result_list) + 1]] <- data.frame(
Genotype = levels(aggregate_data$Genotype)[j],
Metric = levels(aggregate_data$variable)[i],
p_value
)
}
}
# Combine the list of results into a data frame
result_df <- do.call(rbind, result_list)
#clean and organize the data frame for downstream analysis
result_df$contrasts <- rownames(result_df)
rownames(result_df) <- NULL
result_df <- separate(result_df, contrasts, into = c("var1", "var2"), sep = "-", remove = FALSE)
result_df$contrasts <- NULL
result_df <- separate(result_df, var1, into = c("Treatment_1", "Day_1"), sep = ":", remove = FALSE)
result_df <- separate(result_df, var2, into = c("Treatment_2", "Day_2"), sep = ":", remove = FALSE)
result_df$var1 <- NULL
result_df$var2 <- NULL
result_df$Day_2 <- substr(result_df$Day_2, 1, 1)
#Add columns for comparing Treatment and Day status
result_df$treatment_comparison <- ifelse(result_df$Treatment_1 == result_df$Treatment_2, "TRUE", "FALSE")
result_df$day_comparison <- ifelse(result_df$Day_1 == result_df$Day_2, "TRUE", "FALSE")
#Make a new data frame with only the contrasts of interest
cleaned_df <- subset(result_df, treatment_comparison %in% c("FALSE") & day_comparison %in% c("TRUE"))
cleaned_df$Treatment_1 <- NULL
cleaned_df$Treatment_2 <- NULL
cleaned_df$treatment_comparison <- NULL
cleaned_df$day_comparison <- NULL
cleaned_df$Day_2 <- NULL
cleaned_df <- cleaned_df %>%
rename(Day_of_measurement = Day_1)
cleaned_df <- cleaned_df %>%
dplyr::select(Metric, Genotype, Day_of_measurement, p_value)
#add a column with asterisks according to the degree of statistical significance
cleaned_df <- cleaned_df %>%
mutate(sig_level = case_when(
p_value < 0.05 & p_value >= 0.01 ~ "*",
p_value < 0.01 & p_value >= 0.001 ~ "**",
p_value < 0.001 ~ "***",
TRUE ~ ""
))
rownames(cleaned_df) <- NULL
#save results in a data frame with a clearer name
anova_results_by_metric_genotype_day <- cleaned_df
paged_table(anova_results_by_metric_genotype_day)
#add anova results to line graphs
biomass_lineplot <- biomass_lineplot + geom_text(data = subset(anova_results_by_metric_genotype_day, Metric %in% c("Cumulative_biomass_accumulation")), aes(x = Day_of_measurement, y = 6.5, label = sig_level), size = 4.5)
biomass_lineplot

#Now let's make plots for visualizing the linear regression of each metric-genotype-condition combination over the entire time course with the corresponding R-squared values
#convert Days_of_measurement to be a numerical value
data$Days_of_measurement <- as.double(data$Days_of_measurement)
biomass_rate <- ggplot(data, aes(x = Days_of_measurement, y = Cumulative_biomass_accumulation, color = Treatment)) +
#regression line for individual samples
geom_line(stat="smooth", method = "lm", formula = y ~ x, aes(group = Sample_ID, color = Treatment), se = FALSE, linewidth = 0.15, alpha = 0.5) +
#average regression by treatment
geom_smooth(formula = y ~ x, method = lm, aes(fill = Treatment), linetype = 1, size = 0.6, alpha = 0.8) +
geom_jitter(cex = 0.25, width = 0.12, shape = 21, stroke = 0.4) +
scale_color_manual(values = treatment_colors) +
scale_fill_manual(values = treatment_colors) +
facet_grid(.~ Genotype) +
theme(plot.background = element_rect(fill = "transparent", color = "transparent")) +
theme(panel.background = element_rect(fill = "transparent", color = NA), axis.line = element_line(colour = "black")) +
theme(panel.grid.major = element_line(color = 'transparent')) +
theme(panel.grid.minor = element_line(color = "transparent")) +
theme(axis.line = element_line(color = "black", size = 0.65)) +
theme(axis.ticks.length = unit(1.5, "mm")) +
theme(axis.ticks = element_line(size = 0.75, color = "black")) +
theme(axis.title = element_text(size = 12)) +
theme(axis.text = element_text(colour = "black", size = 12)) +
ylab("Biomass accumulation (g)") +
xlab("Days of measurement") +
theme(strip.background = element_rect(color = "black", size = 0.65)) +
theme(strip.text = element_text(size = 12)) +
theme(legend.position = "none") +
#add R-squared values for each average regression line
stat_cor(aes(label=paste(gsub("R", "r",..rr.label..),sep="*','~")),label.x=c(2,2),label.y = c(7.6,6.85), geom = "text", size = 5, r.accuracy = 0.01)
biomass_rate

#Now let's make plots to visualize and compare the 8-day rates (slopes of linear regression)
#First, we need to make a new data frame without Delta_pH values (from which we are not calculating rates)
rates_data_input <- aggregate_data
rates_data_input$variable <- unfactor(rates_data_input$variable)
rates_data_input <- subset(rates_data_input, variable != "Delta_pH")
rates_data_input$variable <- factor(rates_data_input$variable)
#convert Days_of_measurement to be a numeric value rather than a factor
rates_data_input$Days_of_measurement <- unfactor(rates_data_input$Days_of_measurement)
#
rate_result_list <- list()
for (i in 1:length(levels(rates_data_input$variable))) {
current_level <- levels(rates_data_input$variable)[i]
model_results <- rates_data_input %>%
filter(variable == current_level) %>%
group_by(Sample_ID) %>%
do({
model = lm(value ~ Days_of_measurement, data = .)
tidied = tidy(model)
data.frame(Sample_ID = unique(.$Sample_ID), tidied)
})
if (nrow(model_results) == 0) {
warning("No non-missing data for level:", current_level)
}
rate_result_list[[i]] <- model_results %>%
mutate(Metric = current_level)
}
# Combine the list of results into a data frame
rate_result_df <- do.call(rbind, rate_result_list)
#Keep just the rate values
rate_result_df <- subset(rate_result_df, term %in% "Days_of_measurement")
rate_result_df$term <- NULL
rate_result_df <- rate_result_df %>%
rename(Rate = estimate)
rate_result_df <- rate_result_df %>%
mutate(across(c("Rate", "std.error", "statistic", "p.value"), ~ round(., digits = 4)))
paged_table(rate_result_df)
#for streamlining visualization of rate plots, let's make a new data frame containing average rate values and 95% confidence intervals for each genotype-treatment-metric combination
rate_result_df$Metric <- factor(rate_result_df$Metric)
#add Genotype and Treatment information back into rate_result_df and convert both to be factors
rate_result_df$Genotype <- sub("^[^_]+_([^_]+).*", "\\1", rate_result_df$Sample_ID)
rate_result_df$Genotype <- factor(rate_result_df$Genotype, levels = c("RTx430", "BTx623", "LTA", "Grassl", "Tx7078"))
rate_result_df$Treatment <- sub("^[^_]+_[^_]+_([^_]+).*", "\\1", rate_result_df$Sample_ID)
rate_result_df$Treatment <- factor(rate_result_df$Treatment, levels = c("Control", "Mannitol"))
means <- rate_result_df %>% group_by(Genotype, Treatment, Metric) %>%
summarise(.groups="keep", mean_rate = mean(Rate, na.rm = TRUE), CI_rate = (1.96*sd(Rate, na.rm = TRUE)/sqrt(n())))
#Now we can plot average, 8-day rates by metric, genotype, and condition
biomass_rate_plot <- ggplot(subset(rate_result_df, Metric %in% c("Cumulative_biomass_accumulation")), aes(x = Genotype, y = Rate)) +
facet_grid(.~ Treatment) +
geom_beeswarm(color = "darkgrey", cex = 2, size = 1) +
geom_errorbar(inherit.aes = FALSE, data = subset(means, Metric %in% c("Cumulative_biomass_accumulation")), aes(x = Genotype, ymin = mean_rate - CI_rate, ymax = mean_rate + CI_rate, color = Genotype),
width = 0, size = 1) +
stat_summary(fun = mean, size = 5, geom = "point", aes(group = Genotype, color = Genotype)) +
theme(legend.position = "none") +
theme(plot.background = element_rect(fill = "transparent", color = "transparent")) +
theme(panel.background = element_rect(fill = "transparent", color = NA), axis.line = element_line(colour = "transparent")) +
theme(panel.grid.major = element_line(color = 'transparent')) +
theme(panel.grid.minor = element_line(color = "transparent")) +
theme(axis.line = element_line(color = "black", size = 0.65)) +
theme(axis.ticks.length = unit(1.5, "mm")) +
theme(axis.ticks = element_line(size = 0.75, color = "black")) +
theme(axis.title = element_text(size = 12)) +
theme(axis.text = element_text(colour = "black", size = 12)) +
ylab("Biomass accumulation (g/day)") +
theme(axis.title.x = element_blank()) +
theme(strip.background = element_rect(color = "black", size = 0.65)) +
theme(strip.text = element_text(size = 12)) +
theme(axis.text.x = element_text(angle = 90))
biomass_rate_plot

#Now let's statistically compare the 8-day rates for each genotype (separately for each Treatment)
#make a un-grouped data frame to serve as our input
rates_df <- data.frame(rate_result_df)
lin_regs_anova_result <- list()
# Loop over each variable
for (i in 1:length(levels(rate_result_df$Metric))) {
# Loop over each Treatment
for (j in 1:length(levels(rate_result_df$Treatment))) {
# Subset data for each Metric-Treatment combination
subset_data <- subset(rate_result_df, Metric == levels(rate_result_df$Metric)[i] & Treatment == levels(rate_result_df$Treatment)[j])
# 2-way ANOVA
model <- aov(Rate ~ Genotype, data = subset_data)
# Post-hoc pairwise comparisons using emmeans
model_means <- emmeans(model, specs = "Genotype")
# Extract letter groups using cld function
letters <- cld(model_means, Letters = as.character(LETTERS))
#temp_df <- data.frame(letters)
# Store the results in the list
lin_regs_anova_result[[length(lin_regs_anova_result) + 1]] <- data.frame(
Treatment = levels(rate_result_df$Treatment)[j],
Metric = levels(rate_result_df$Metric)[i],
Genotype = levels(rate_result_df$Genotype),
Letters = as.character(letters$.group)
)
}
}
rate_anova_results <- do.call(rbind, lin_regs_anova_result)
#remove leading spaces from Tukey letters column
rate_anova_results$Letters <- trimws(rate_anova_results$Letters)
#view Tukey-HSD post-hoc letter results
paged_table(rate_anova_results)
#now let's add the Tukey letters to our rate plots
rate_anova_results$Treatment <- factor(rate_anova_results$Treatment, levels = c("Control", "Mannitol"))
rate_anova_results$Genotype <- factor(rate_anova_results$Genotype, levels = c("RTx430", "BTx623", "LTA", "Grassl", "Tx7078"))
rate_anova_results$Metric <- factor(rate_anova_results$Metric)
#We don't want to confuse the results of our 1-way ANOVA across for each treatment (i.e. Control) with the 1-way ANOVA results for the Mannitol samples. So let's convert case of letters for Mannitol samples to be lowercase
rate_anova_results <- rate_anova_results %>%
mutate(Letters = ifelse(Treatment == "Mannitol", tolower(Letters), Letters))
biomass_rate_plot <- biomass_rate_plot + geom_text(data = subset(rate_anova_results, Metric %in% c("Cumulative_biomass_accumulation")), aes(x = Genotype, y = 1, label = Letters), size = 4.5)
biomass_rate_plot
