Applied packages
library(tidyverse)
library(vegan)
library(ggplot2)
library(ggrepel)
library(ggpmisc)
library(cowplot)
library(scales)
library(colorspace)
library(zoo)
Data set up
asv_dat<- read.csv("WEC_ASV_data.csv", row.names = 1) # treat as index/labels
asv_dat<- asv_dat %>%
t()%>% # transposes so that WEC is column name
as.data.frame()%>% # rewrites transposed column as a dataframe (needed)
rownames_to_column("Sample")
asv_dat <-asv_dat%>%
mutate(Total_Reads=rowSums(select(.,starts_with(("WEC")))))
asv_table<-asv_dat%>%
select(starts_with("WEC"))
env_dat<- read.csv("WEC_environmental_data.csv")
env_dat$season <- factor(env_dat$season,
levels = c("spring","summer","autumn","winter"))
env_dat$Date <- as.Date(env_dat$Date,"%Y-%m-%d")
taxa_dat<- read.csv("WEC_taxonomic_data.csv")
Alpha diversity
asv_abundance<-asv_table/rowSums(asv_table)
shannon_indices<- diversity(asv_abundance, index="shannon")
diversity_df<-data.frame(
Sample=asv_dat[,1],
Shannon_Index=shannon_indices
)
Single parameter analysis
merged_env<-env_dat%>%
left_join(diversity_df,by="Sample")
env_means <- merged_env %>%
summarise(across(
.cols = c(Shannon_Index, NITRATE_NITRITE, PHOSPHATE, salinity_PSU, oxygen_uM, temp_C),
.fns = ~mean(.x, na.rm = TRUE)
))
print(env_means)
env_totalmeans <- merged_env %>%
summarise(across(where(is.numeric), ~mean(.x, na.rm = TRUE)))
#In case for later calculations or justifications
#Temperature
temp <- ggplot(merged_env,aes(x=temp_C,y=Shannon_Index))+
geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
geom_smooth(method="lm",col="blue", na.rm = TRUE)+
stat_poly_eq(use_label(c("R2")), label.x = 0.9, label.y = 0.9)+
labs(x="Temperature (°C)",y="Alpha Diveristy")+
geom_point(data = env_means,
aes(x = env_means$temp_C, y = env_means$Shannon_Index),
color = "red", size = 4) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")
)
ggsave("temp.png",height=5, width=8.39)
#Oxygen
oxy <- ggplot(merged_env,aes(x=oxygen_uM,y=Shannon_Index))+
geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
geom_smooth(method="lm",col="blue", na.rm = TRUE)+
coord_cartesian(xlim = c(175, 350))+ # zooms in without erasing outlier
labs(x="Oxygen (µM)",y="Alpha Diveristy")+
stat_poly_eq(use_label(c("R2")), label.x = 0.05, label.y = 0.1)+
geom_point(data = env_means,
aes(x = env_means$oxygen_uM, y = env_means$Shannon_Index),
color = "red", size = 4) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")
)
ggsave("oxy.png",height=5, width=8.39)
#Salinity
sal <- ggplot(merged_env,aes(x=salinity_PSU,y=Shannon_Index))+
geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
geom_smooth(method="lm",col="blue", na.rm=TRUE)+
stat_poly_eq(use_label(c("R2")), label.x = 0.2, label.y = 0.1)+
labs(x="Salinity (PSU)",y="Alpha Diveristy")+
geom_point(data = env_means,
aes(x = env_means$salinity_PSU, y = env_means$Shannon_Index),
color = "red", size = 4) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")
)
ggsave("sal.png",height=5, width=8.39)
#Phosphate
pho <- ggplot(merged_env,aes(x=PHOSPHATE,y=Shannon_Index))+
geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
geom_smooth(method="lm",col="blue", na.rm=TRUE)+
stat_poly_eq(use_label(c("R2")), label.x = 0.9, label.y = 0.1)+
labs(x="Phosphate (µM)",y="Alpha Diveristy")+
geom_point(data = env_means,
aes(x = env_means$PHOSPHATE, y = env_means$Shannon_Index),
color = "red", size = 4) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")
)
ggsave("pho.png",height=5, width=8.39)
#Nitrate_Nitrite
nit <- ggplot(merged_env,aes(x=NITRATE_NITRITE,y=Shannon_Index))+
geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
geom_smooth(method="lm",col="blue", na.rm=TRUE)+
labs(x="Nitrate - Nitrite (µM)",y="Alpha Diveristy")+
stat_poly_eq(use_label(c("R2")), label.x = 0.9, label.y = 0.1)+
geom_point(data = env_means,
aes(x = env_means$NITRATE_NITRITE, y = env_means$Shannon_Index),
color = "red", size = 4) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")
)
ggsave("nit.png",height=5, width=8.39)
nit_lm <- lm(Shannon_Index ~ NITRATE_NITRITE, data = merged_env)
summary(nit_lm)
Multiple regression
mod1 <- lm(Shannon_Index ~ AMMONIA + SILICATE +
NITRATE_NITRITE, data = merged_env)
mod2 <- lm(Shannon_Index ~ AMMONIA + SILICATE +
NITRATE_NITRITE + PHOSPHATE, data = merged_env)
summary(mod1)
#Adjusted R2 = 0.4644 (46% of the variance can be explained by x, y, and z)
#Better than a simple linear model as just Nitrate/Nitrite = 0.39
#R2 needs to be adjusted as more variables (even weakly associated) increases R2
summary(mod1)$coefficient
# t value = is there a significant association between the predictor and outcome variable
# coefficient = average effect on y of a one unit increase in predictor
confint(mod1)
AIC(mod1)
#Even though AIC is a tad higher than mod2, it is better
sigma(mod1)/mean(merged_env$Shannon_Index)
#The RSE(Residual Standard Error) is 0.07 (7% error rate)
# Highly precise and reliable statistical estimate
used_indices <-names(mod1$residuals)
merged_env_clean <- merged_env[used_indices,]
merged_env_clean$Predicted <- predict(mod1)
stats_label <- "AIC: 107 ~~ italic(R)^2: 0.46"
#Title: "Observed vs. Predicted Alpha Diversity Over Time"
multi_reg <- ggplot(merged_env_clean, aes(x = Date)) +
geom_point(aes(y = Shannon_Index), alpha = 0.5, size = 1) +
geom_line(aes(y = Predicted), color = "blue", size = 1) +
labs(y = "Alpha Diversity (Shannon Index)",
x = "Year")+
theme_classic()
print(multi_reg)
multi_reg + annotate(
"text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.1,
label = stats_label,
parse = TRUE
)
ggsave("multi_reg.png", height=5, width=8.39)
Seasonal variation of Flavobacteriaceae family
flav_id <- taxa_dat %>%
filter(Family == "Flavobacteriaceae") %>% # use this to continue the tidyverse
pull(ASV)
flav_abundance <- asv_dat%>%
mutate(Flav_Reads= rowSums(select(., any_of(flav_id))))%>%
mutate(Flav_RelAbund=(Flav_Reads/Total_Reads)*100)%>%
select(Sample,Flav_RelAbund)
#need to transpose the asv data again
flav_matrix <- asv_dat %>%
select(any_of(flav_id))
flav_diversity <- data.frame(
Sample = asv_dat$Sample,
Richness = specnumber(flav_matrix),
Shannon = diversity(flav_matrix, index="shannon")
)
flav_results <- env_dat %>%
left_join(flav_abundance, by = "Sample")%>%
left_join(flav_diversity, by = "Sample")%>%
arrange(Date)
str(flav_results)
facet_names <- c(
"Shannon" = "Alpha diversity",
"Flav_RelAbund" = "Relative Abundance (%)"
)
season_var <- flav_results %>%
select(Date, Shannon, Flav_RelAbund, season) %>%
pivot_longer(cols = c(Shannon, Flav_RelAbund),
names_to = "Metric", values_to = "Value") %>%
ggplot(aes(x = Date, y = Value, color = season)) +
geom_point(alpha = 0.5) +
geom_smooth(color = "black", size = 0.8, se = FALSE) +
facet_wrap(~Metric, scales = "free_y", ncol = 1,
labeller = as_labeller(facet_names)) +
theme_bw() +
theme(
panel.grid = element_blank()
)+
labs(
x = "Date",
y = "Value",
color = "Season"
) +
scale_color_manual(values = c("autumn"="dodgerblue4","spring"="cyan3",
"summer"="blue","winter"="orange"),
labels = c("autumn" = "Autumn", "spring" = "Spring",
"summer" = "Summer", "winter" = "Winter"))
ggsave("season_var.png",height=5, width=8.39)
season_aov <- aov(Shannon ~ season, data = flav_results)
summary(season_aov)
# Significant p < 0.001
season_tukey <- TukeyHSD(season_aov)
print(season_tukey)
shapiro.test(season_aov$residuals)
# Passed normality test as p value is not significant
Fourier analysis
ts_data <- flav_results %>%
group_by(Date) %>%
summarise(RelAbund = mean(Flav_RelAbund, na.rm = TRUE)) %>%
arrange(Date)
all_dates <- data.frame(
Date = seq(min(ts_data$Date), max(ts_data$Date), by = "7 days")
)
ts_regular <- all_dates %>%
left_join(ts_data, by = "Date") %>%
mutate(RelAbund_Interp = na.approx(RelAbund))
# runs the Periodogram
flux_spec <- spec.pgram(ts_regular$RelAbund_Interp - mean(ts_regular$RelAbund_Interp),
taper = 0,
span = c(3, 3), # A common stable smoothing setting
plot = FALSE)
# convert frequency to a time scale (Days)
# interval is 7 days
spec_df <- data.frame(
freq = flux_spec$freq,
spec = flux_spec$spec,
period_days = (1 / flux_spec$freq) * 7
)
four <- ggplot(spec_df, aes(x = period_days, y = spec)) +
geom_line(color = "steelblue") +
scale_x_continuous(limits = c(0, 500), breaks = c(7, 30, 90, 180, 365)) +
theme_minimal() +
theme(
panel.grid = element_blank(),
axis.line.x = element_line(color = "black"),
axis.line.y = element_line(color = "black"))+
labs(
x = "Period (Days)",
y = "Spectral Density (Strength of Signal)"
) +
geom_vline(xintercept = 365, linetype = "dashed", color = "red") +
annotate("text", x = 365, y = max(spec_df$spec), label = "Annual Cycle", hjust = -0.1)
ggsave("four.png",height=5, width=8.39)
#Testing significance using the permutation test
abundance <- flav_abundance[, 2]
# 1. Center the data to remove the DC offset (mean)
abundance_centered <- abundance - mean(abundance)
# 2. Calculate observed peak (ignoring the first element just in case)
# We only look at the first half (Nyquist frequency) for real-valued signals
N <- length(abundance_centered)
obs_fft <- Mod(fft(abundance_centered))[2:(N/2 + 1)]
obs_peak <- max(obs_fft)
# 3. Permutation test
shuffled_peaks <- replicate(999, {
shuffled <- sample(abundance_centered)
# Repeat the same slicing logic
max(Mod(fft(shuffled))[2:(N/2 + 1)])
})
# 4. Calculate P-value (adding 1 to numerator/denominator for unbiased estimate)
p_val <- (sum(shuffled_peaks >= obs_peak) + 1) / (length(shuffled_peaks) + 1)
print(p_val)
NMDS plot
bray_dist <- vegdist(asv_abundance, method = "bray")
ITS_nmds <- metaMDS(bray_dist, k = 2, trymax = 100, autotransform = FALSE)
stress_val <- round(ITS_nmds$stress, 3)
nmds_coords <- as.data.frame(ITS_nmds$points)
nmds_coords$Sample <- asv_dat[,1]
nmds_coords <- nmds_coords %>%
left_join(env_dat, by = c("Sample"))
season_order <- c("spring", "summer", "autumn", "winter")
season_labels <- c("Spring", "Summer", "Autumn", "Winter")
fit <- envfit(ITS_nmds, merged_env, permutations = 999, na.rm = TRUE)
arrow_coords <- vegan::scores(fit, "vectors")
arrow_dat <- as.data.frame(arrow_coords)
# Add stats
arrow_dat$pvals <- fit$vectors$pvals
arrow_dat$rsq <- fit$vectors$r
# Filter to top 4 significant variables
arrow_dat <- arrow_dat[arrow_dat$pvals < 0.05, ]
arrow_dat <- arrow_dat[order(arrow_dat$rsq, decreasing = TRUE), ][1:4, ]
# MANUALLY assign the labels to a clean character column
# Ensure this is exactly 4 names long
arrow_dat$LabelName <- c( "Phosphate","Nitrate - Nitrite",
"Weekly Temperature", "Silicate")
nmds_plot <- ggplot(nmds_coords, aes(x = MDS1, y = MDS2)) +
stat_ellipse(aes(colour = season), type = "t", linetype = 2,
alpha = 0.6, show.legend = FALSE) +
geom_point(aes(colour = season, shape = season), size = 3, alpha = 0.7) +
geom_point(data=nmds_coords, aes(x = MDS1, y = MDS2, colour = season, shape = season)) +
geom_segment(data = arrow_dat,
aes(x = 0, y = 0, xend = NMDS1*2, yend = NMDS2*2),
arrow = arrow(length = unit(0.18,"cm")), color = "black",
inherit.aes = FALSE)+
geom_text_repel(data = arrow_dat,
aes(x = NMDS1 * 2.1,
y = NMDS2 * 2.1,
label = LabelName),
hjust = 1.1,
color = "black",
fontface = "bold",
size = 3,
family = "sans",
inherit.aes = FALSE)+
scale_color_manual(
name = "Season",
breaks = season_order,
labels = season_labels,
values = c("spring" = "cyan3", "summer" = "blue",
"autumn" = "dodgerblue4", "winter" = "orange")
) +
scale_shape_manual(
name = "Season",
breaks = season_order,
labels = season_labels,
values = c(17, 18, 16, 15)
) +
annotate("text", x = 1.8, y = 1.8,
label = paste("Stress:", round(stress_val, 3)),
hjust = 1, vjust = 1, fontface = "italic", size = 4) +
labs(x = "NMDS Dimension 1", y = "NMDS Dimension 2") +
coord_fixed(ratio = 0.8, xlim = c(-2.5, 2.2), ylim = c(-2.2, 2)) +
theme_bw() +
theme(
legend.position = "right",
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.background = element_blank(),
legend.key = element_blank()
)
print(nmds_plot)
ggsave("nmds_plot.png",height=6, width=8)
Temperature change
temp_over <- ggplot(merged_env,aes(x=Date,y=temp_C))+
geom_point(shape=21,alpha=0.4, fill = "grey70", color ="black",size=2.5, na.rm = TRUE)+
geom_hline(aes(yintercept = mean(temp_C,na.rm=TRUE)),
color="red",linetype="dashed",size=0.8)+
geom_smooth(color="royalblue",size=1.2,se=FALSE,na.rm = TRUE)+
labs(x="Years",y="Temperature (°C)")+
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")
)
print(temp_over)
ggsave("temp_over.png",height=5, width=8.39)
#temp stats
temp_mod <- lm(temp_C ~ Date, data = merged_env)
summary(temp_mod)
#estimate is change per day, need per year
0.0005776*365 #0.210824 (temp change per year)
#p value is significant