Applied packages

library(tidyverse)
library(vegan)
library(ggplot2)
library(ggrepel)
library(ggpmisc)
library(cowplot)
library(scales)
library(colorspace)
library(zoo)

Data set up

asv_dat<- read.csv("WEC_ASV_data.csv", row.names = 1) # treat as index/labels 
asv_dat<- asv_dat %>%
  t()%>% # transposes so that WEC is column name
  as.data.frame()%>% # rewrites transposed column as a dataframe (needed)
  rownames_to_column("Sample") 
asv_dat <-asv_dat%>%
  mutate(Total_Reads=rowSums(select(.,starts_with(("WEC")))))
asv_table<-asv_dat%>%
  select(starts_with("WEC"))
env_dat<- read.csv("WEC_environmental_data.csv")
env_dat$season <- factor(env_dat$season,
                         levels = c("spring","summer","autumn","winter"))
env_dat$Date <- as.Date(env_dat$Date,"%Y-%m-%d")
taxa_dat<- read.csv("WEC_taxonomic_data.csv")

Alpha diversity

asv_abundance<-asv_table/rowSums(asv_table)
shannon_indices<- diversity(asv_abundance, index="shannon")
diversity_df<-data.frame(
  Sample=asv_dat[,1],
  Shannon_Index=shannon_indices
)

Single parameter analysis

merged_env<-env_dat%>%
  left_join(diversity_df,by="Sample")
env_means <- merged_env %>%
  summarise(across(
    .cols = c(Shannon_Index, NITRATE_NITRITE, PHOSPHATE, salinity_PSU, oxygen_uM, temp_C),
    .fns = ~mean(.x, na.rm = TRUE)
  ))
print(env_means)

env_totalmeans <- merged_env %>% 
  summarise(across(where(is.numeric), ~mean(.x, na.rm = TRUE)))
#In case for later calculations or justifications

#Temperature
temp <- ggplot(merged_env,aes(x=temp_C,y=Shannon_Index))+
  geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
  geom_smooth(method="lm",col="blue", na.rm = TRUE)+
  stat_poly_eq(use_label(c("R2")), label.x = 0.9, label.y = 0.9)+
  labs(x="Temperature (°C)",y="Alpha Diveristy")+
  geom_point(data = env_means, 
             aes(x = env_means$temp_C, y = env_means$Shannon_Index),
             color = "red", size = 4) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
ggsave("temp.png",height=5, width=8.39)

#Oxygen
oxy <- ggplot(merged_env,aes(x=oxygen_uM,y=Shannon_Index))+
  geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
  geom_smooth(method="lm",col="blue", na.rm = TRUE)+
  coord_cartesian(xlim = c(175, 350))+ # zooms in without erasing outlier
  labs(x="Oxygen (µM)",y="Alpha Diveristy")+
  stat_poly_eq(use_label(c("R2")), label.x = 0.05, label.y = 0.1)+
  geom_point(data = env_means, 
             aes(x = env_means$oxygen_uM, y = env_means$Shannon_Index), 
             color = "red", size = 4) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
ggsave("oxy.png",height=5, width=8.39)

#Salinity
sal <- ggplot(merged_env,aes(x=salinity_PSU,y=Shannon_Index))+
  geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
  geom_smooth(method="lm",col="blue", na.rm=TRUE)+
  stat_poly_eq(use_label(c("R2")), label.x = 0.2, label.y = 0.1)+
  labs(x="Salinity (PSU)",y="Alpha Diveristy")+
  geom_point(data = env_means, 
             aes(x = env_means$salinity_PSU, y = env_means$Shannon_Index), 
             color = "red", size = 4) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
ggsave("sal.png",height=5, width=8.39)

#Phosphate 
pho <- ggplot(merged_env,aes(x=PHOSPHATE,y=Shannon_Index))+
  geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
  geom_smooth(method="lm",col="blue", na.rm=TRUE)+
  stat_poly_eq(use_label(c("R2")), label.x = 0.9, label.y = 0.1)+
  labs(x="Phosphate (µM)",y="Alpha Diveristy")+
  geom_point(data = env_means, 
             aes(x = env_means$PHOSPHATE, y = env_means$Shannon_Index), 
             color = "red", size = 4) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
ggsave("pho.png",height=5, width=8.39)

#Nitrate_Nitrite 
nit <- ggplot(merged_env,aes(x=NITRATE_NITRITE,y=Shannon_Index))+
  geom_point(shape=21,alpha=0.8,fill="grey",color="black",size=4, na.rm = TRUE)+
  geom_smooth(method="lm",col="blue", na.rm=TRUE)+
  labs(x="Nitrate - Nitrite (µM)",y="Alpha Diveristy")+
  stat_poly_eq(use_label(c("R2")), label.x = 0.9, label.y = 0.1)+
  geom_point(data = env_means, 
             aes(x = env_means$NITRATE_NITRITE, y = env_means$Shannon_Index), 
             color = "red", size = 4) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
ggsave("nit.png",height=5, width=8.39)

nit_lm <- lm(Shannon_Index ~ NITRATE_NITRITE, data = merged_env)
summary(nit_lm)

Multiple regression

mod1 <- lm(Shannon_Index ~ AMMONIA + SILICATE +
             NITRATE_NITRITE, data = merged_env)
mod2 <- lm(Shannon_Index ~ AMMONIA + SILICATE +
             NITRATE_NITRITE + PHOSPHATE, data = merged_env)
summary(mod1)
#Adjusted R2 = 0.4644 (46% of the variance can be explained by x, y, and z)
#Better than a simple linear model as just Nitrate/Nitrite = 0.39
#R2 needs to be adjusted as more variables (even weakly associated) increases R2
summary(mod1)$coefficient
# t value = is there a significant association between the predictor and outcome variable
# coefficient = average effect on y of a one unit increase in predictor
confint(mod1)
AIC(mod1)
#Even though AIC is a tad higher than mod2, it is better

sigma(mod1)/mean(merged_env$Shannon_Index)
#The RSE(Residual Standard Error) is 0.07 (7% error rate)
# Highly precise and reliable statistical estimate

used_indices <-names(mod1$residuals)
merged_env_clean <- merged_env[used_indices,]
merged_env_clean$Predicted <- predict(mod1)

stats_label <- "AIC: 107 ~~ italic(R)^2: 0.46"

#Title: "Observed vs. Predicted Alpha Diversity Over Time"
multi_reg <- ggplot(merged_env_clean, aes(x = Date)) +
  geom_point(aes(y = Shannon_Index), alpha = 0.5, size = 1) + 
  geom_line(aes(y = Predicted), color = "blue", size = 1) +
  labs(y = "Alpha Diversity (Shannon Index)", 
       x = "Year")+
  theme_classic()
print(multi_reg)
multi_reg + annotate(
  "text",
  x = Inf, y = Inf,
  hjust = 1.1, vjust = 1.1,
  label = stats_label,
  parse = TRUE
)

ggsave("multi_reg.png", height=5, width=8.39)

Seasonal variation of Flavobacteriaceae family

flav_id <- taxa_dat %>%
  filter(Family == "Flavobacteriaceae") %>% # use this to continue the tidyverse
  pull(ASV)

flav_abundance <- asv_dat%>%
  mutate(Flav_Reads= rowSums(select(., any_of(flav_id))))%>%
  mutate(Flav_RelAbund=(Flav_Reads/Total_Reads)*100)%>%
  select(Sample,Flav_RelAbund)

#need to transpose the asv data again 
flav_matrix <- asv_dat %>%
  select(any_of(flav_id))

flav_diversity <- data.frame(
  Sample = asv_dat$Sample,
  Richness = specnumber(flav_matrix),
  Shannon = diversity(flav_matrix, index="shannon")
)

flav_results <- env_dat %>%
  left_join(flav_abundance, by = "Sample")%>%
  left_join(flav_diversity, by = "Sample")%>%
  arrange(Date)
str(flav_results)

facet_names <- c(
  "Shannon" = "Alpha diversity",
  "Flav_RelAbund" = "Relative Abundance (%)"
)

season_var <- flav_results %>%
  select(Date, Shannon, Flav_RelAbund, season) %>%
  pivot_longer(cols = c(Shannon, Flav_RelAbund), 
               names_to = "Metric", values_to = "Value") %>%
  ggplot(aes(x = Date, y = Value, color = season)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = "black", size = 0.8, se = FALSE) +
  facet_wrap(~Metric, scales = "free_y", ncol = 1, 
             labeller = as_labeller(facet_names)) + 
  theme_bw() +
  theme(
    panel.grid = element_blank()
    )+
  labs(
    x = "Date",
    y = "Value",            
    color = "Season"
  ) +
  scale_color_manual(values = c("autumn"="dodgerblue4","spring"="cyan3", 
                                "summer"="blue","winter"="orange"),
                     labels = c("autumn" = "Autumn", "spring" = "Spring", 
                                  "summer" = "Summer", "winter" = "Winter"))
ggsave("season_var.png",height=5, width=8.39)

season_aov <- aov(Shannon ~ season, data = flav_results)
summary(season_aov)
# Significant p < 0.001
season_tukey <- TukeyHSD(season_aov)
print(season_tukey)
shapiro.test(season_aov$residuals)
# Passed normality test as p value is not significant

Fourier analysis

ts_data <- flav_results %>%
  group_by(Date) %>%
  summarise(RelAbund = mean(Flav_RelAbund, na.rm = TRUE)) %>%
  arrange(Date)

all_dates <- data.frame(
  Date = seq(min(ts_data$Date), max(ts_data$Date), by = "7 days")
)

ts_regular <- all_dates %>%
  left_join(ts_data, by = "Date") %>%
  mutate(RelAbund_Interp = na.approx(RelAbund))

# runs the Periodogram
flux_spec <- spec.pgram(ts_regular$RelAbund_Interp - mean(ts_regular$RelAbund_Interp), 
                        taper = 0, 
                        span = c(3, 3), # A common stable smoothing setting
                        plot = FALSE)

# convert frequency to a time scale (Days)
# interval is 7 days
spec_df <- data.frame(
  freq = flux_spec$freq,
  spec = flux_spec$spec,
  period_days = (1 / flux_spec$freq) * 7
)

four <- ggplot(spec_df, aes(x = period_days, y = spec)) +
  geom_line(color = "steelblue") +
  scale_x_continuous(limits = c(0, 500), breaks = c(7, 30, 90, 180, 365)) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.line.x = element_line(color = "black"),
    axis.line.y = element_line(color = "black"))+
  labs(
    x = "Period (Days)",
    y = "Spectral Density (Strength of Signal)"
  ) +
  geom_vline(xintercept = 365, linetype = "dashed", color = "red") +
  annotate("text", x = 365, y = max(spec_df$spec), label = "Annual Cycle", hjust = -0.1)
ggsave("four.png",height=5, width=8.39)

#Testing significance using the permutation test

abundance <- flav_abundance[, 2]

# 1. Center the data to remove the DC offset (mean)
abundance_centered <- abundance - mean(abundance)

# 2. Calculate observed peak (ignoring the first element just in case)
# We only look at the first half (Nyquist frequency) for real-valued signals
N <- length(abundance_centered)
obs_fft <- Mod(fft(abundance_centered))[2:(N/2 + 1)] 
obs_peak <- max(obs_fft)

# 3. Permutation test
shuffled_peaks <- replicate(999, {
  shuffled <- sample(abundance_centered)
  # Repeat the same slicing logic
  max(Mod(fft(shuffled))[2:(N/2 + 1)])
})

# 4. Calculate P-value (adding 1 to numerator/denominator for unbiased estimate)
p_val <- (sum(shuffled_peaks >= obs_peak) + 1) / (length(shuffled_peaks) + 1)

print(p_val)

NMDS plot

bray_dist <- vegdist(asv_abundance, method = "bray")
ITS_nmds <- metaMDS(bray_dist, k = 2, trymax = 100, autotransform = FALSE)

stress_val <- round(ITS_nmds$stress, 3)
nmds_coords <- as.data.frame(ITS_nmds$points)
nmds_coords$Sample <- asv_dat[,1]
nmds_coords <- nmds_coords %>% 
  left_join(env_dat, by = c("Sample"))
season_order <- c("spring", "summer", "autumn", "winter")
season_labels <- c("Spring", "Summer", "Autumn", "Winter")

fit <- envfit(ITS_nmds, merged_env, permutations = 999, na.rm = TRUE)
arrow_coords <- vegan::scores(fit, "vectors")
arrow_dat <- as.data.frame(arrow_coords)

# Add stats
arrow_dat$pvals <- fit$vectors$pvals
arrow_dat$rsq <- fit$vectors$r

# Filter to top 4 significant variables
arrow_dat <- arrow_dat[arrow_dat$pvals < 0.05, ]
arrow_dat <- arrow_dat[order(arrow_dat$rsq, decreasing = TRUE), ][1:4, ]

# MANUALLY assign the labels to a clean character column
# Ensure this is exactly 4 names long
arrow_dat$LabelName <- c( "Phosphate","Nitrate - Nitrite",
                          "Weekly Temperature", "Silicate")


nmds_plot <- ggplot(nmds_coords, aes(x = MDS1, y = MDS2)) +
  stat_ellipse(aes(colour = season), type = "t", linetype = 2,
               alpha = 0.6, show.legend = FALSE) +
  geom_point(aes(colour = season, shape = season), size = 3, alpha = 0.7) +
  geom_point(data=nmds_coords, aes(x = MDS1, y = MDS2, colour = season, shape = season)) +
  geom_segment(data = arrow_dat,
               aes(x = 0, y = 0, xend = NMDS1*2, yend = NMDS2*2),
               arrow = arrow(length = unit(0.18,"cm")), color = "black",
               inherit.aes = FALSE)+
  geom_text_repel(data = arrow_dat,
                  aes(x = NMDS1 * 2.1, 
                      y = NMDS2 * 2.1, 
                      label = LabelName),
                  hjust = 1.1,
                  color = "black", 
                  fontface = "bold",
                  size = 3, 
                  family = "sans",      
                  inherit.aes = FALSE)+
  scale_color_manual(
    name = "Season",
    breaks = season_order,
    labels = season_labels,
    values = c("spring" = "cyan3", "summer" = "blue",
               "autumn" = "dodgerblue4", "winter" = "orange")
  ) +
  scale_shape_manual(
    name = "Season",
    breaks = season_order,
    labels = season_labels,
    values = c(17, 18, 16, 15)
  ) +
  annotate("text", x = 1.8, y = 1.8, 
           label = paste("Stress:", round(stress_val, 3)), 
           hjust = 1, vjust = 1, fontface = "italic", size = 4) +
  labs(x = "NMDS Dimension 1", y = "NMDS Dimension 2") +
  coord_fixed(ratio = 0.8, xlim = c(-2.5, 2.2), ylim = c(-2.2, 2)) +
  theme_bw() +
  theme(
    legend.position = "right",
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.background = element_blank(),
    legend.key = element_blank()
  )
print(nmds_plot) 
ggsave("nmds_plot.png",height=6, width=8)

Temperature change

temp_over <- ggplot(merged_env,aes(x=Date,y=temp_C))+
  geom_point(shape=21,alpha=0.4, fill = "grey70", color ="black",size=2.5, na.rm = TRUE)+
  geom_hline(aes(yintercept = mean(temp_C,na.rm=TRUE)),
             color="red",linetype="dashed",size=0.8)+
  geom_smooth(color="royalblue",size=1.2,se=FALSE,na.rm = TRUE)+
  labs(x="Years",y="Temperature (°C)")+
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(temp_over)
ggsave("temp_over.png",height=5, width=8.39)

#temp stats
temp_mod <- lm(temp_C ~ Date, data = merged_env)
summary(temp_mod)
#estimate is change per day, need per year
0.0005776*365 #0.210824 (temp change per year)
#p value is significant