Number of Samples by Location

# Load Packages
library(tidyverse)
library(vegan)
library(viridis)
library(ggrepel)
theme_set(theme_bw() + theme(plot.caption = element_text(hjust = 0, size = 12),
                             plot.title = element_text(size = 14, face = "bold")))

# Read in data; code below to check parameters if needed
hs = read.csv("PhytoData_Hom_Sel_Expanded.csv")
# glimpse(hs)
# unique(hs$Species)
# unique(hs$Abundance)
# unique(hs$Site.ID)
# unique(hs$Group)

# Convert dates to date class
hs$Date = mdy(hs$SampleDate)
# class(hs$Date)

# Create month, year, day of year variables
hs$Month = fct_recode(as.factor(month(hs$Date)), Jan = "1", Feb = "2", Mar = "3", Apr = "4", May = "5", Jun = "6", Jul = "7", Aug = "8", Sep = "9", Oct = "10", Nov = "11", Dec = "12")
hs$Year = as.factor(year(hs$Date))
hs$Day.of.Yr = yday(hs$Date)

# Convert abundance categories to 1-3 scale (add 0s later for absent species)
hs$num.abund = rep(NA, nrow(hs))
hs$num.abund[hs$Abundance == "P"] = 1
hs$num.abund[hs$Abundance == "A"] = 2
hs$num.abund[hs$Abundance == "B"] = 3

# Create presence-absence column (all present for now)
hs$PA = rep(1, nrow(hs))

# Drop certain sites: after looking at how many samples we have, it's clear that 'Homer Spit West', Little Tutka, Little Jakolof, and North Hesketh just don't have enough samples to evaluate differences from other sites. Nick Dudiak Lagoon was intensively sampled May-July 2012 (20-30 samples per month), with some other sampling 2010-2018, so any differences in phytoplankton community from the Homer harbor/SWMP sites would skew the early data.
hs = hs %>% filter(!(Site.ID %in% c("HSW","NDL","LJB","LJC","LTH","LTM","NHI"))) %>%
  filter(!(Year %in% c("2010","2011","2012")))  # Per Rosie, sample identification was less detailed prior to 2013

hs = hs %>% drop_na(num.abund) # Dropped 11 observations with blank Abundance value

# Create N/S, Bay variables
hs$NS = fct_collapse(hs$Site.ID, 
                     N = c("HMH","HMS"),
                     S = c("SVB","SVS","SVH","SVM","KBE","KBL","JBB","JBD","JBR","EPE","EPW","EPY"))

hs$Bay = fct_collapse(hs$Site.ID,
                      HOM = c("HMH","HMS"),
                      SEL = c("SVB","SVS","SVH","SVM"),
                      KAS = c("KBE","KBL"),
                      JAK = c("JBB","JBD","JBR"),
                      EP = c("EPE","EPW","EPY"))

#### Filter to Homer, Seldovia, Jakolof, Kasitsna for final plots ####
hs = hs %>% filter(Bay %in% c("HOM","SEL","KAS","JAK"))

test = hs %>% group_by(Date, Site.ID, Species) %>% summarise(n = n()) # There is 1 date/site triplicate (Seldovia SWMP 6/9/14) and there are 32 duplicates. At least 4 pairs have inconsistent abundance values. Probably all from multiple samples in one day.
test = test %>% group_by(Date, Site.ID) %>% summarise(effort = max(n))

# Collapse Chaetoceros into one "species", make abundance the max chaetoceros abundance in the sample
hs = hs %>% mutate(Species = fct_collapse(hs$Species, "Chaetoceros spp." = c("Chaetoceros spp.","Chaetoceros socialis","Chaetoceros debilis","Chaetoceros concavicornis","Chaetoceros laciniosus"))) %>% group_by(Species, Group, Site.ID, NS, Bay, Date, Month, Year, Day.of.Yr, PA) %>%
  summarise(num.abund = max(num.abund)) %>%
  ungroup()

hs = hs %>% inner_join(test, by = c("Date","Site.ID")) %>%
  group_by(Species, Site.ID, NS, Bay, Group, Date, Month, Year, Day.of.Yr, PA, effort) %>% 
  summarise(num.abund = sum(num.abund),
            mean.abund = num.abund/effort) %>% 
  ungroup() # Sum and average abundances for duplicates.

# Create wide version for later analysis (using averages in the species table)
hs.wide = hs %>% select(-c(Group, mean.abund, num.abund)) %>% 
  pivot_wider(names_from = Species, values_from = PA, values_fill = 0)

# Filter full data to Apr-Sept, add "season" variable to test
hs.summer = hs %>% filter(Month %in% c("Apr","May","Jun","Jul","Aug","Sep"))
hs.wide.summer = hs.wide %>% filter(Month %in% c("Apr","May","Jun","Jul","Aug","Sep"))

# Get number of samples by bay/month/yr
hs.sp.rich.b = hs %>% group_by(Year, Month, Date, effort, Day.of.Yr, Bay) %>% summarise(species = n())
hs.by.mon.b = hs.sp.rich.b %>% group_by(Year, Month, Bay) %>% summarise(samples = sum(effort))

# Plot number of samples by month/year
ggplot(hs.by.mon.b) + 
  geom_tile(aes(x = Month, y = Year, fill = samples)) +
  scale_fill_viridis_c() +
  theme(panel.grid = element_blank(),
        panel.background = element_rect(fill = "gray90")) +
  facet_wrap(~Bay)

Species Abundance Heat Maps

By Day of Year

# Define common species (came from wider analysis)
common.spp = c("Chaetoceros spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Leptocylindrus spp.","Thalassionema spp.","Protoperidinium spp.","Coscinodiscus spp.","Navicula morphotype","Pleurosigma morphotype","Rhizosolenia spp.","Skeletonema spp.","Dictyocha spp. (Chromophyte)")

# sort(unique(hs.summer$Day.of.Yr))   # missing 8 days now that we just have HOM and SEL

doy = hs.summer %>% 
  select(-c(PA, num.abund)) %>%
  filter(Species %in% common.spp) %>%
  pivot_wider(names_from = Species, values_from = mean.abund, values_fill = 0) %>%
  group_by(Day.of.Yr) %>% 
  summarise(across(9:(length(common.spp) + 8), mean))

# Add data for missing dates (3 of them) - average average abundance day before and after for each species
for (d in c(97,117,272)){
  means = apply(doy %>% filter(Day.of.Yr %in% c(d-1,d,d+1)) %>% select(!c("Day.of.Yr")),2,mean)
  doy = rbind(doy, cbind(Day.of.Yr = d, t(means)))
}

spp1 = colnames(doy)
colnames(doy) = make.names(colnames(doy))

spp = colnames(doy)
for (sp in 2:(length(common.spp) + 1)){
  interp = loess(formula = as.formula(paste(spp[sp], "~", "Day.of.Yr")), data = doy, span = 0.5)
  doy[,sp] = interp$fitted
}

ggplot(doy %>% pivot_longer(2:(length(common.spp) + 1), names_to = "Species", values_to = "Mean.Abundance")) +
  geom_tile(aes(x = Day.of.Yr, y = reorder(Species, Mean.Abundance, desc = TRUE), fill = Mean.Abundance)) +
  scale_fill_gradient(low = "purple4",high = "green") +
  labs(x = "Day of Year", y = "Species", title = "Seasonal Change (Apr-Sept)", fill = "Mean Abundance") +
  scale_x_continuous(expand = c(0,0)) + 
  scale_y_discrete(expand = c(0,0), labels = c("Dictyocha spp.","Rhizosolenia spp.","Skeletonema spp.","Coscinodiscus spp.","Protoperidinium spp.","Pleurosigma morphotype","Navicula morphotype","Thalassionema spp.","Leptocylindrus spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Chaetoceros spp."))

By Year

y.avg = hs.summer %>%
  select(-c(PA, num.abund)) %>%
  filter(Species %in% common.spp) %>%
  pivot_wider(names_from = Species, values_from = mean.abund, values_fill = 0) %>%
  group_by(Year) %>%
  summarise(across(9:(length(common.spp)+8), mean))

colnames(y.avg) = make.names(colnames(y.avg))
y.avg$Year = as.numeric(as.character(y.avg$Year))

spp = colnames(y.avg)
for (sp in 2:(length(common.spp) + 1)){
  interp = loess(formula = as.formula(paste(spp[sp], "~", "Year")), data = y.avg, span = 0.5)
  y.avg[,sp] = interp$fitted
}

# Plot species mean abundance by year
ggplot(y.avg %>% pivot_longer(2:(length(common.spp)+1), names_to = "Species", values_to = "Mean.Abundance")) +
  geom_tile(aes(x = Year, y = factor(Species, levels = c("Dictyocha.spp...Chromophyte.","Rhizosolenia.spp.","Skeletonema.spp.","Coscinodiscus.spp.","Protoperidinium.spp.","Pleurosigma.morphotype","Navicula.morphotype","Thalassionema.spp.","Leptocylindrus.spp.","Pseudo.nitzschia.spp.","Thalassiosira.spp.","Chaetoceros.spp.")), fill = Mean.Abundance)) +
  scale_fill_viridis_c(option = "cividis") +
  labs(x = "Year", y = "Species", fill = "Mean Abundance", title = "Interannual Change (2013-2024)") +
  scale_x_continuous(expand = c(0,0), breaks = 2013:2024) +
  scale_y_discrete(expand = c(0,0), labels = c("Dictyocha spp.","Rhizosolenia spp.","Skeletonema spp.","Coscinodiscus spp.","Protoperidinium spp.","Pleurosigma morphotype","Navicula morphotype","Thalassionema spp.","Leptocylindrus spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Chaetoceros spp.")) +
  theme(axis.text = element_text(size = 8))