This is a pipeline describing the general approach for analyzing the SPIRO data from Jade’s experiment on hormone mutants and their reaction to NAT:
I m going to load the data from each SPIRO and add a collumn with the treatment of each plate - since I dont have it coded within my root names:
list.files()
## [1] "E01J01P01.csv"
## [2] "E01J01P02.csv"
## [3] "E01J01P03.csv"
## [4] "E01J01P04.csv"
## [5] "E01J02P01.csv"
## [6] "E01J02P02.csv"
## [7] "E01J02P03.csv"
## [8] "E01J02P04.csv"
## [9] "Figure.Experiment01.pdf"
## [10] "Jade_NAT_Experiment01.nb.html"
## [11] "Jade_NAT_Experiment01.Rmd"
## [12] "NAT_effect_Experiment01_btw_treatments_per_genotype.pdf"
J1P1 <- read.csv("E01J01P01.csv")
J1P1$treatment <- "NAT"
J1P2 <- read.csv("E01J01P02.csv")
J1P2$treatment <- "NAT"
J1P3 <- read.csv("E01J01P03.csv")
J1P3$treatment <- "MS"
J1P4 <- read.csv("E01J01P04.csv")
J1P4$treatment <- "MS"
J2P1 <- read.csv("E01J02P01.csv")
J2P1$treatment <- "MS"
J2P2 <- read.csv("E01J02P02.csv")
J2P2$treatment <- "NAT"
J2P3 <- read.csv("E01J02P03.csv")
J2P3$treatment <- "MS"
J2P4 <- read.csv("E01J02P04.csv")
J2P4$treatment <- "NAT"
Next - I want to mmerge all of my experimental data into one dataset:
data <- rbind(J1P1, J1P2, J1P3, J1P4, J2P1, J2P2, J2P3, J2P4)
data
cool - now let’s extract from image name the date and time so we can calculate hours of experiment:
strsplit(data$image[1], "-")
## [[1]]
## [1] "plate1" "20240618" "214018" "day.rsml"
strsplit(data$image[1], "-")[[1]][2]
## [1] "20240618"
strsplit(data$image[1], "-")[[1]][3]
## [1] "214018"
date is going to be the 2nd item, while time is going to be the third one - let’s save them into their own column.
Since our experiment does not go over change of month - we are going to be just OK extracting the day from the date and hour from the time stamp
# for day
substr(strsplit(data$image[1], "-")[[1]][2],7,8)
## [1] "18"
# for hour
substr(strsplit(data$image[1], "-")[[1]][3],1,2)
## [1] "21"
OK - now let’s loop it throughout the entire data:
for(i in 1:nrow(data)){
data$day[i] <- substr(strsplit(data$image[i], "-")[[1]][2],7,8)
data$hour[i] <- substr(strsplit(data$image[i], "-")[[1]][3],1,2)
}
data
awesome! Now - let’s extract only the columns we are going to use (length of the root), since our roots do not produce any lateral roots throughout the experiment:
colnames(data)
## [1] "image" "root_name" "root"
## [4] "length" "surface" "volume"
## [7] "diameter" "root_order" "root_ontology"
## [10] "parent_name" "parent" "insertion_position"
## [13] "insertion_angle" "n_child" "child_density"
## [16] "first_child" "insertion_first_child" "last_child"
## [19] "insertion_last_child" "treatment" "day"
## [22] "hour"
data
data2 <- data[,c(20:22,1:4)]
data2
ok - now let’s calculate time of experiment (in hours). This specific experiment was started on June 18th at 11:00 AM, and thus - TOE will be:
data2$TOE <- (as.numeric(as.character(data2$day)) - 18)*24 + (as.numeric(as.character(data2$hour)) - 11)
data2
OK - now - lets plot all of the different roots for their growth to see if the overall patterns are matching with our expectations:
library(ggplot2)
lgraph <- ggplot(data=data2, aes(x= TOE, y=length, group = root, color = treatment))
lgraph <- lgraph + geom_line(alpha = 0.7)
lgraph <- lgraph + ylab("root legnth") + xlab("hours of imaging")
lgraph
OK looks OK - with couple of exceptions - but we will get rid of them later.
First - let’s calculate splines for our plants, as well as the overall growth rate throughout this experiment for each genotype.
Let’s first isolate one plant and establish the calculations on one sample:
# Create a vector 'hours' containing values from 0 to 72 (3 days of imaging)
hours <- seq(0, 72, by = 1)
hours
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [26] 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## [51] 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
temp <- subset(data2, data2$root == unique(data2$root)[1])
# Convert the 'TOE' column in 'temp' from a factor to numeric
temp$TOE <- as.numeric(as.character(temp$TOE))
temp <- temp[order(temp$TOE, decreasing = F),]
# Display the 'hours' column from the 'temp' data frame
temp$TOE
## [1] 0 10 19 34 43 58 72
# Fit a cubic smoothing spline to the 'TOE' and 'length' data in the 'temp' data frame
plot.spl <- with(temp, smooth.spline(TOE, length, df = 5))
# Create a scatterplot of 'Area.sum' against 'days' using data from 'temp'
plot(length ~ TOE, data = temp)
# Add a blue line to the plot based on the cubic smoothing spline ('plot.spl')
lines(plot.spl, col = "blue")
# Add a red line to the plot by predicting values using the cubic smoothing spline ('plot.spl')
lines(predict(plot.spl, hours), col = "red")
Now - let’s save all of these data into spline file:
# Define a vector 'names' containing column names
names <- c(text = "root", "genotype", "treatment", "TOE", "length")
# Create an empty data frame 'spline_data' to store the data
spline_data <- data.frame()
# Loop through each element 'k' in the 'names' vector
for (k in names) {
# Create a new column in 'spline_data' with the name specified by 'k' and initialize it as character type
spline_data[[k]] <- as.character()
}
spline_data
pred_temp <- predict(plot.spl, hours)
# Display the predictions stored in 'pred_temp'
pred_temp
## $x
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [26] 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## [51] 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
##
## $y
## [1] 1.051824 1.074113 1.096382 1.118610 1.140778 1.162865 1.184851 1.206715
## [9] 1.228436 1.249996 1.271373 1.292550 1.313527 1.334305 1.354885 1.375270
## [17] 1.395462 1.415462 1.435273 1.454896 1.474337 1.493613 1.512743 1.531748
## [25] 1.550648 1.569464 1.588215 1.606921 1.625604 1.644282 1.662977 1.681708
## [33] 1.700495 1.719359 1.738320 1.757398 1.776606 1.795959 1.815472 1.835158
## [41] 1.855031 1.875106 1.895396 1.915916 1.936669 1.957624 1.978735 1.999961
## [49] 2.021259 2.042585 2.063896 2.085150 2.106303 2.127312 2.148135 2.168727
## [57] 2.189047 2.209051 2.228696 2.247951 2.266833 2.285371 2.303592 2.321527
## [65] 2.339204 2.356652 2.373899 2.390975 2.407907 2.424725 2.441458 2.458134
## [73] 2.474782
spline_data[1:73,4] <- pred_temp$x
spline_data[1:73,5] <- pred_temp$y
spline_data[1:73,1] <- temp$root[1]
spline_data[1:73,2] <- temp$root_name[1]
spline_data[1:73,3] <- temp$treatment[1]
spline_data
great - now let’s save it into another dataframe:
# Create a copy of 'spline_data' and store it in 'final_spline'
final_spline <- spline_data
# check how many unique plant IDs we have
all_plants <- unique(data2$root)
length(all_plants)
## [1] 221
Cool - now lets calculate it for all of the plants:
for (i in 2:221) {
temp <- subset(data2, data2$root == unique(data2$root)[i])
# Check if 'temp' has more than 4 rows
if (dim(temp)[1] > 4) {
# Fit a cubic smoothing spline to 'days' and 'Area.sum' in 'temp'
temp$TOE <- as.numeric(as.character(temp$TOE))
temp <- temp[order(temp$TOE, decreasing = F),]
plot.spl <- with(temp, smooth.spline(TOE, length, df = 5))
# Generate predictions using the smoothing spline
pred_temp <- predict(plot.spl, hours)
# Update specific columns in 'spline_data' with 'pred_temp' and 'temp' values
spline_data[1:73,4] <- pred_temp$x
spline_data[1:73,5] <- pred_temp$y
spline_data[1:73,1] <- temp$root[1]
spline_data[1:73,2] <- temp$root_name[1]
spline_data[1:73,3] <- temp$treatment[1]
# Append 'spline_data' to 'final_spline'
final_spline <- rbind(final_spline, spline_data)
} else {
# Set specific columns in 'spline_data' when 'temp' has <= 4 rows
spline_data[1:73,4] <- hours
spline_data[1:73,5] <- 0
spline_data[1:73,1] <- temp$root[1]
spline_data[1:73,2] <- temp$root_name[1]
spline_data[1:73,3] <- temp$treatment[1]
}
}
final_spline
Let’s do some small cleanup:
final_spline$TOE <- as.numeric(as.character(final_spline$TOE))
# there are some stray roots that were not assigned per genotype = let's remove them
bye <- c(" root_0", " root_2", " root_9")
final_spline <- subset(final_spline, !(final_spline$genotype %in% bye))
unique(final_spline$genotype)
## [1] " 14" " wt" " 15" " 13" " 10" " 12" " 11" " 9" " 3" " 2" " 4" " 1"
## [13] " 7" " 8" " 6" " 5"
# Convert the 'length' column in 'final_spline' to numeric type
final_spline$length <- as.numeric(as.character(final_spline$length))
final_spline
this looks great - now let’s re-plot it with some additional stats:
library(ggsci)
library(ggpubr)
# Convert 'TOE' column to a factor with numeric levels
final_spline$TOE <- as.factor(as.numeric(as.character(final_spline$TOE)))
# Create a line graph (Area_lgraph) using ggplot2
Length_lgraph <- ggplot(data = final_spline, aes(x = TOE, y = length, group = root, color = treatment))
Length_lgraph <- Length_lgraph + geom_line(alpha = 0.1)
Length_lgraph <- Length_lgraph + stat_summary(fun.data = mean_se, geom = "ribbon", linetype = 0, aes(group = treatment), alpha = 0.3)
Length_lgraph <- Length_lgraph + stat_summary(fun = mean, aes(group = treatment), size = 0.7, geom = "line", linetype = "dashed")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Length_lgraph <- Length_lgraph + stat_compare_means(aes(group = treatment), label = "p.signif", method = "t.test", hide.ns = T)
Length_lgraph <- Length_lgraph + labs(x = "Hours of Imaging", y = "length (cm)") + scale_color_d3("category10") + theme(legend.position = "bottom")
Length_lgraph
OK - while this seems great - we still would like to know a little more
about how different genotypes are behaving here. Let’s facet this graph
per genotype:
# Let's order genotypes in the order that I d like to see them:
final_spline$genotype <- factor(final_spline$genotype, levels =c(" wt", " 1", " 2", " 3", " 4", " 5", " 6", " 7", " 8", " 9", " 10", " 11", " 12", " 13", " 14", " 15"))
final_spline$TOE <- as.numeric(final_spline$TOE)
Length_lgraph <- ggplot(data = final_spline, aes(x = TOE, y = length, group = root, color = treatment))
Length_lgraph <- Length_lgraph + geom_line(alpha = 0.1)
Length_lgraph <- Length_lgraph + facet_wrap(~ genotype)
Length_lgraph <- Length_lgraph + stat_summary(fun.data = mean_se, geom = "ribbon", linetype = 0, aes(group = treatment), alpha = 0.3)
Length_lgraph <- Length_lgraph + stat_summary(fun = mean, aes(group = treatment), size = 0.7, geom = "line", linetype = "dashed")
#Length_lgraph <- Length_lgraph + stat_compare_means(aes(group = treatment), label = "p.signif", method = "t.test", hide.ns = T)
Length_lgraph <- Length_lgraph + labs(x = "Hours of Imaging", y = "length (cm)") + scale_color_d3("category10") + theme(legend.position = "bottom")
Length_lgraph
additionally - we can calculate a linear growth rate by fitting a linear regression into the observed data:
temp <- subset(final_spline, final_spline$root == unique(final_spline$root)[1])
temp$length <- as.numeric(temp$length)
temp$TOE <- as.numeric(temp$TOE)
temp
plot(length ~ TOE, data = temp)
plot(temp$length ~ temp$TOE) +
# Linear regression line is added to the scatterplot using the lm() function
abline(lm(temp$length ~ temp$TOE))
## integer(0)
# Fit a linear regression model to predict 'length' based on 'hours' in the 'temp' data frame
model <- lm(temp$length ~ temp$TOE)
model
##
## Call:
## lm(formula = temp$length ~ temp$TOE)
##
## Coefficients:
## (Intercept) temp$TOE
## 1.0503 0.0198
# Extract the growth rate (GR) coefficient from the linear regression model
GR <- model$coefficients[2]
GR
## temp$TOE
## 0.01979515
# Calculate the coefficient of determination (R-squared) for the model
R2 <- summary(model)$r.squared
R2
## [1] 0.9996501
# Define vector 'names' containing column names for the 'growth_data' data frame
names <- c(text = "root", "genotype", "treatment", "GR", "R2")
# Create an empty data frame 'growth_data' to store growth-related data
growth_data <- data.frame()
# Loop through each element 'k' in the 'names' vector
for (k in names) {
# Create a new column in 'growth_data' with the name specified by 'k' and set as character type
growth_data[[k]] <- as.character()
}
# Assign the 'ID' value from the 'temp' data frame to the first row, first column of 'growth_data'
growth_data[1, 1] <- temp$root[1]
# Assign the 'Accession' value from the 'temp' data frame to the first row, second column of 'growth_data'
growth_data[1, 2] <- as.character(temp$genotype[1])
# Assign the 'Treatment' value from the 'temp' data frame to the first row, third column of 'growth_data'
growth_data[1, 3] <- temp$treatment[1]
# Assign the growth rate (GR) to the first row, fourth column of 'growth_data'
growth_data[1, 4] <- GR
# Assign the coefficient of determination (R-squared, R2) to the first row, fifth column of 'growth_data'
growth_data[1, 5] <- R2
growth_data
ok - now let’s calculate it for all:
# Create vector 'all_plants' containing unique 'ID' values from 'final_spline'
all_plants <- unique(final_spline$root)
# Loop through each unique 'ID' in 'all_plants'
for (i in 1:length(all_plants)) {
# Subset 'final_spline' to select data for the current 'ID'
temp <- subset(final_spline, final_spline$root == all_plants[i])
temp$length <- as.numeric(temp$length)
temp$TOE <- as.numeric(temp$TOE)
# Fit a linear regression model to predict 'Area.SUM' based on 'day'
model <- lm(temp$length ~ temp$TOE)
# Extract the growth rate (GR) coefficient from the linear regression model
GR <- model$coefficients[2]
# Calculate the coefficient of determination (R-squared, R2) for the model
R2 <- summary(model)$r.squared
# Populate 'growth_data' with relevant information for the current 'ID'
growth_data[i, 1] <- temp$root[1]
growth_data[i, 2] <- as.character(temp$genotype[1])
growth_data[i, 3] <- temp$treatment[1]
growth_data[i, 4] <- GR
growth_data[i, 5] <- R2
}
growth_data
Let’s fo a little clean up:
# Convert the 'GR' and 'R2' columns in 'growth_data' to numeric
growth_data$GR <- as.numeric(as.character(growth_data$GR))
growth_data$R2 <- as.numeric(as.character(growth_data$R2))
growth_data_clean <- subset(growth_data, growth_data$R2 > 0.7)
growth_data_clean <- subset(growth_data_clean, growth_data_clean$GR > 0)
growth_data_clean
Great - now we have all of this data - let’s plot it:
# Let's order genotypes in the order that I d like to see them:
growth_data_clean$genotype <- factor(growth_data_clean$genotype, levels =c(" wt", " 1", " 2", " 3", " 4", " 5", " 6", " 7", " 8", " 9", " 10", " 11", " 12", " 13", " 14", " 15"))
growth_data_clean$treatment <- factor(growth_data_clean$treatment, levels =c("MS", "NAT"))
GR_overall <- ggerrorplot(growth_data_clean, x = "treatment", y = "GR", color = "treatment", fill = "treatment", facet.by = "genotype",
desc_stat = "mean_sd", add = "jitter", ncol = 8,
xlab="", ylab= "Growth Rate (cm / hour)", add.params = list(color = "darkgray")) + scale_color_d3("category10")
GR_overall <- GR_overall + stat_compare_means(method = "aov", label.y = 0.04)
GR_overall <- GR_overall + rremove("legend")
GR_overall
OK - this is one way of seeing it - which is - do we see differences
between treatments. Not much differences - but let’s save it as PDF for
a better view.
pdf("NAT_effect_Experiment01_btw_treatments_per_genotype.pdf", width = 15, height = 10)
print(GR_overall)
dev.off()
## quartz_off_screen
## 2
OK - there are definitely few that show SIGNIFICANT differences between treatment and control. Which is COOL! <3
Now - let’s look at the differences between genotypes:
GR_overall02 <- ggerrorplot(growth_data_clean, x = "genotype", y = "GR", color = "genotype", fill = "genotype", facet.by = "treatment",
desc_stat = "mean_sd", add = "jitter", ncol = 8,
xlab="", ylab= "Growth Rate (cm / hour)", add.params = list(color = "darkgray"))
GR_overall02 <- GR_overall02 + stat_compare_means(aes(label = after_stat(p.signif)), method = "t.test", ref.group = " wt", hide.ns = T)
GR_overall02 <- GR_overall02 + rremove("legend")
GR_overall02
now let’s save these figures into one nice figure:
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
##
## get_legend
pdf("Figure.Experiment01.pdf", height = 25, width = 15)
plot_grid(Length_lgraph, GR_overall, GR_overall02, labels = "AUTO", ncol = 1)
dev.off()
## quartz_off_screen
## 2