Species Abundance Heat Maps

By Month

# Load Packages
library(tidyverse)
library(vegan)
library(viridis)
library(ggrepel)
# library(goeveg)
# library(plotly)
# library(knitr)
# library(factoextra)
theme_set(theme_bw() + theme(plot.caption = element_text(hjust = 0, size = 12)))

# Read in data; code below to check parameters if needed
hs = read.csv("PhytoData_Hom_Sel_Expanded.csv")
# glimpse(hs)
# unique(hs$Species)
# unique(hs$Abundance)
# unique(hs$Site.ID)
# unique(hs$Group)

# Convert dates to date class
hs$Date = mdy(hs$SampleDate)
# class(hs$Date)

# Create month, year, day of year variables
hs$Month = fct_recode(as.factor(month(hs$Date)), Jan = "1", Feb = "2", Mar = "3", Apr = "4", May = "5", Jun = "6", Jul = "7", Aug = "8", Sep = "9", Oct = "10", Nov = "11", Dec = "12")
hs$Year = as.factor(year(hs$Date))
hs$Day.of.Yr = yday(hs$Date)

# Convert abundance categories to 1-3 scale (add 0s later for absent species)
hs$num.abund = rep(NA, nrow(hs))
hs$num.abund[hs$Abundance == "P"] = 1
hs$num.abund[hs$Abundance == "A"] = 2
hs$num.abund[hs$Abundance == "B"] = 3

# Create presence-absence column (all present for now)
hs$PA = rep(1, nrow(hs))

# Drop certain sites: after looking at how many samples we have, it's clear that 'Homer Spit West', Little Tutka, Little Jakolof, and North Hesketh just don't have enough samples to evaluate differences from other sites. Nick Dudiak Lagoon was intensively sampled May-July 2012 (20-30 samples per month), with some other sampling 2010-2018, so any differences in phytoplankton community from the Homer harbor/SWMP sites would skew the early data.
hs = hs %>% filter(!(Site.ID %in% c("HSW","NDL","LJB","LJC","LTH","LTM","NHI"))) %>%
  filter(!(Year %in% c("2010","2011","2012")))  # Per Rosie, sample identification was less detailed prior to 2013

hs = hs %>% drop_na(num.abund) # Dropped 11 observations with blank Abundance value

# Create N/S, Bay variables
hs$NS = fct_collapse(hs$Site.ID, 
                     N = c("HMH","HMS"),
                     S = c("SVB","SVS","SVH","SVM","KBE","KBL","JBB","JBD","JBR","EPE","EPW","EPY"))

hs$Bay = fct_collapse(hs$Site.ID,
                      HOM = c("HMH","HMS"),
                      SEL = c("SVB","SVS","SVH","SVM"),
                      KAS = c("KBE","KBL"),
                      JAK = c("JBB","JBD","JBR"),
                      EP = c("EPE","EPW","EPY"))

test = hs %>% group_by(Date, Site.ID, Species) %>% summarise(n = n()) # There is 1 date/site triplicate (Seldovia SWMP 6/9/14) and there are 32 duplicates. At least 4 pairs have inconsistent abundance values. Probably all from multiple samples in one day.
test = test %>% group_by(Date, Site.ID) %>% summarise(effort = max(n))

# Collapse Chaetoceros into one "species", make abundance the max chaetoceros abundance in the sample
hs = hs %>% mutate(Species = fct_collapse(hs$Species, "Chaetoceros spp." = c("Chaetoceros spp.","Chaetoceros socialis","Chaetoceros debilis","Chaetoceros concavicornis","Chaetoceros laciniosus"))) %>% group_by(Species, Group, Site.ID, NS, Bay, Date, Month, Year, Day.of.Yr, PA) %>%
  summarise(num.abund = max(num.abund)) %>%
  ungroup()

hs = hs %>% inner_join(test, by = c("Date","Site.ID")) %>%
  group_by(Species, Site.ID, NS, Bay, Group, Date, Month, Year, Day.of.Yr, PA, effort) %>% 
  summarise(num.abund = sum(num.abund),
            mean.abund = num.abund/effort) %>% 
  ungroup() # Sum and average abundances for duplicates.

# Create wide version for later analysis (using averages in the species table)
hs.wide = hs %>% select(-c(Group, mean.abund, num.abund)) %>% 
  pivot_wider(names_from = Species, values_from = PA, values_fill = 0)

# Filter full data to Apr-Sept, add "season" variable to test
hs.summer = hs %>% filter(Month %in% c("Apr","May","Jun","Jul","Aug","Sep"))
hs.wide.summer = hs.wide %>% filter(Month %in% c("Apr","May","Jun","Jul","Aug","Sep"))

############################

common.spp = c("Chaetoceros spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Leptocylindrus spp.","Thalassionema spp.","Protoperidinium spp.","Coscinodiscus spp.","Navicula morphotype","Pleurosigma morphotype","Rhizosolenia spp.","Skeletonema spp.","Dictyocha spp. (Chromophyte)")

doy = hs.summer %>% 
  select(-c(PA, num.abund)) %>%
  filter(Species %in% common.spp) %>%
  pivot_wider(names_from = Species, values_from = mean.abund, values_fill = 0) %>%
  group_by(Day.of.Yr) %>% 
  summarise(across(9:(length(common.spp) + 8), mean))
mean.97 = cbind(data.frame(Day.of.Yr = 97), (doy[doy$Day.of.Yr == 96,2:(length(common.spp) + 1)] + doy[doy$Day.of.Yr == 98,2:(length(common.spp) + 1)])/2)
mean.272 = cbind(data.frame(Day.of.Yr = 272), (doy[doy$Day.of.Yr == 271,2:(length(common.spp) + 1)] + doy[doy$Day.of.Yr == 273,2:(length(common.spp) + 1)])/2)
doy = rbind(doy, mean.97, mean.272)
spp1 = colnames(doy)
colnames(doy) = make.names(colnames(doy))

ggplot(doy %>% pivot_longer(2:(length(common.spp) + 1), names_to = "Species", values_to = "Mean.Abundance")) +
  geom_tile(aes(x = Day.of.Yr, y = reorder(Species, Mean.Abundance, desc = TRUE), fill = Mean.Abundance)) +
  scale_fill_viridis_c(option = "cividis") +
  labs(x = "Day of Year", y = "Species", fill = "Mean Abundance", caption = str_wrap("Average abundance by day of year for 12 common types of phytoplankton (spring through fall). Abundance is measured on a 0-3 scale, where 0 = absent, 1 = present, 2 = abundant, and 3 = blooming.")) +
  scale_x_continuous(expand = c(0,0), breaks = c(122,153,183,214,245), labels = c("May 1st","Jun 1st","Jul 1st","Aug 1st","Sep 1st")) +
  scale_y_discrete(expand = c(0,0), labels = c("Dictyocha spp. (Chromophyte)","Rhizosolenia spp.","Skeletonema spp.","Coscinodiscus spp.","Protoperidinium spp.","Pleurosigma morphotype","Navicula morphotype","Thalassionema spp.","Leptocylindrus spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Chaetoceros spp."))

spp = colnames(doy)
for (sp in 2:(length(common.spp) + 1)){
  interp = loess(formula = as.formula(paste(spp[sp], "~", "Day.of.Yr")), data = doy, span = 0.5)
  doy[,sp] = interp$fitted
}

ggplot(doy %>% pivot_longer(2:(length(common.spp) + 1), names_to = "Species", values_to = "Mean.Abundance")) +
  geom_tile(aes(x = Day.of.Yr, y = reorder(Species, Mean.Abundance, desc = TRUE), fill = Mean.Abundance)) +
  scale_fill_viridis_c(option = "cividis") +
  labs(x = "Day of Year", y = "Species", fill = "Mean Abundance",caption = str_wrap("Average abundance by day of year for 12 common types of phytoplankton (spring through fall), smoothed with local polynomial regression. Abundance is measured on a 0-3 scale, where 0 = absent, 1 = present, 2 = abundant, and 3 = blooming.")) +
  scale_x_continuous(expand = c(0,0), breaks = c(122,153,183,214,245), labels = c("May 1st","Jun 1st","Jul 1st","Aug 1st","Sep 1st")) +
  scale_y_discrete(expand = c(0,0), labels = c("Dictyocha spp. (Chromophyte)","Rhizosolenia spp.","Skeletonema spp.","Coscinodiscus spp.","Protoperidinium spp.","Pleurosigma morphotype","Navicula morphotype","Thalassionema spp.","Leptocylindrus spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Chaetoceros spp."))

By Year

y.avg = hs.summer %>% 
  select(-c(PA, num.abund)) %>%
  filter(Species %in% common.spp) %>%
  pivot_wider(names_from = Species, values_from = mean.abund, values_fill = 0) %>%
  group_by(Year) %>% 
  summarise(across(9:(length(common.spp)+8), mean))

# Plot species mean abundance by year
ggplot(y.avg %>% pivot_longer(2:(length(common.spp)+1), names_to = "Species", values_to = "Mean.Abundance")) +
  geom_tile(aes(x = Year, y = factor(Species, levels = c("Dictyocha spp. (Chromophyte)","Rhizosolenia spp.","Skeletonema spp.","Coscinodiscus spp.","Protoperidinium spp.","Pleurosigma morphotype","Navicula morphotype","Thalassionema spp.","Leptocylindrus spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Chaetoceros spp.")), fill = Mean.Abundance)) +
  scale_fill_viridis_c(option = "cividis") +
  labs(x = "Year", y = "Species", fill = "Mean Abundance", caption = str_wrap("Average abundance by year for 12 common types of phytoplankton (spring through fall). Abundance is measured on a 0-3 scale, where 0 = absent, 1 = present, 2 = abundant, and 3 = blooming.")) +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_discrete(expand = c(0,0))

colnames(y.avg) = make.names(colnames(y.avg))
y.avg$Year = as.numeric(as.character(y.avg$Year))

spp = colnames(y.avg)
for (sp in 2:(length(common.spp) + 1)){
  interp = loess(formula = as.formula(paste(spp[sp], "~", "Year")), data = y.avg, span = 0.5)
  y.avg[,sp] = interp$fitted
}

# Plot species mean abundance by year
ggplot(y.avg %>% pivot_longer(2:(length(common.spp)+1), names_to = "Species", values_to = "Mean.Abundance")) +
  geom_tile(aes(x = Year, y = factor(Species, levels = c("Dictyocha.spp...Chromophyte.","Rhizosolenia.spp.","Skeletonema.spp.","Coscinodiscus.spp.","Protoperidinium.spp.","Pleurosigma.morphotype","Navicula.morphotype","Thalassionema.spp.","Leptocylindrus.spp.","Pseudo.nitzschia.spp.","Thalassiosira.spp.","Chaetoceros.spp.")), fill = Mean.Abundance)) +
  scale_fill_viridis_c(option = "cividis") +
  labs(x = "Year", y = "Species", fill = "Mean Abundance", caption = str_wrap("Average abundance by year for 12 common types of phytoplankton (spring through fall), smoothed with local polynomial regression. Abundance is measured on a 0-3 scale, where 0 = absent, 1 = present, 2 = abundant, and 3 = blooming.")) +
  scale_x_continuous(expand = c(0,0), breaks = 2013:2024) +
  scale_y_discrete(expand = c(0,0), labels = c("Dictyocha spp. (Chromophyte)","Rhizosolenia spp.","Skeletonema spp.","Coscinodiscus spp.","Protoperidinium spp.","Pleurosigma morphotype","Navicula morphotype","Thalassionema spp.","Leptocylindrus spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Chaetoceros spp."))

nMDS Ordination Plots

mat.ns = as.matrix(hs.wide.summer[,9:71])

# Try taking out rare species:
sums = data.frame(sum = apply(mat.ns, 2, sum))
# arrange(sums, sum)

mat.common = as.data.frame(mat.ns) %>% select(where(function(x) sum(x) > 100))
# Reduced from 63 to 31 species.
zero_rows = which(apply(mat.common, 1, sum) == 0)
# zero_rows # 1 row of zeros introduced
mat.common = as.matrix(mat.common[-208,])
hs.wide.summer.c = hs.wide.summer[-208,]

# set.seed(1009) # make reproducible
# screeplot = dimcheckMDS(mat.common, distance = "bray", trymax = 5, k = 4) +  abline(h = 0.10, col = "blue", lty = 2 ) + 
#       abline(h = 0.05, col = "green", lty = 2)
# Stress about the same... 0.418 with 1 dimension, 0.270 with 2, 0.208 with 3, 0.170 with 4

set.seed(5762)  # make it reproducible
nMDS.ns = metaMDS(mat.common, trymax = 5, k = 3, distance = "bray")  # with this seed, a no solution converges within 500 tries; set trymax to 25 for knitted markdown to reduce runtime - will bump back up for final graph

## Run 0 stress 0.2083043 
## Run 1 stress 0.2083477 
## ... Procrustes: rmse 0.007116901  max resid 0.1140652 
## Run 2 stress 0.2084424 
## ... Procrustes: rmse 0.004059118  max resid 0.06570457 
## Run 3 stress 0.2143731 
## Run 4 stress 0.2084099 
## ... Procrustes: rmse 0.003362436  max resid 0.05575497 
## Run 5 stress 0.2081412 
## ... New best solution
## ... Procrustes: rmse 0.006963874  max resid 0.09136426 
## *** Best solution was not repeated -- monoMDS stopping criteria:
##      5: no. of iterations >= maxit

nMDS.ns$stress # 0.2079

## [1] 0.2081412

By Day of Year

# Put the nmds data into a table for plotting
nmds.data = data.frame(Year = hs.wide.summer.c$Year,  # make data frame with all the info for graphing
                       Year.Continuous = as.numeric(as.character(hs.wide.summer.c$Year)),  # Year as continuous
                       Month = hs.wide.summer.c$Month, 
                       Bay = hs.wide.summer.c$Bay,
                       NS = hs.wide.summer.c$NS,
                       Day.of.Yr = hs.wide.summer.c$Day.of.Yr,
                       nmds1 = nMDS.ns$points[,1], 
                       nmds2 = nMDS.ns$points[,2], 
                       nmds3 = nMDS.ns$points[,3])

# Regress factor data (site, month, year) onto nMDS
env = nmds.data[,c("NS","Bay","Month","Year","Day.of.Yr","Year.Continuous")]  # make data frame of factors to test
envfit.hs = envfit(nmds.data[,c("nmds1","nmds2","nmds3")], env, choices = c(1:4))  # run regression

# Create table of species centroids for plotting
species.centroids = data.frame(Species = rownames(nMDS.ns$species), # make data frame for species centroids
                               nmds1 = nMDS.ns$species[,1], 
                               nmds2 = nMDS.ns$species[,2], 
                               nmds3 = nMDS.ns$species[,3])
common.spp.centroids = species.centroids %>%
  filter(Species %in% c("Chaetoceros spp.","Pseudo-nitzschia spp.","Thalassiosira spp.","Leptocylindrus spp.","Thalassionema spp.","Protoperidinium spp.","Coscinodiscus spp.","Navicula morphotype","Pleurosigma morphotype","Rhizosolenia spp.","Skeletonema spp.","Dictyocha spp. (Chromophyte)"))

# Create table of North-South centroids
ns.cent = as.data.frame(envfit.hs$factors[[1]][1:2,1:3])
ns.cent$NS = rownames(ns.cent)
# Create table of Bay/Water Body Centroids
bay.cent = as.data.frame(envfit.hs$factors[[1]][3:7,1:3])
bay.cent$bay = rownames(bay.cent)
# Create table of month centroids
m.cent = as.data.frame(envfit.hs$factors[[1]][8:13,1:3])
m.cent$month = rownames(m.cent)
# Create table of year centroids
y.cent = as.data.frame(envfit.hs$factors[[1]][14:25,1:3])
y.cent$year = rownames(y.cent)
# Create table of vectors (year and day of year)
vec = as.data.frame(envfit.hs$vectors[[1]])


ggplot() +
  geom_point(data = nmds.data, mapping = aes(x = nmds1, y = nmds2, color = Day.of.Yr, shape = NS)) +
  scale_color_viridis_c() +
  geom_label_repel(data = common.spp.centroids, mapping = aes(x = nmds1, y = nmds2, label = Species), size = 3, fill = alpha(c("white"),0.7)) +
  geom_segment(data = vec, aes(x = 0, xend = nmds1, y = 0, yend = nmds2), color = "red3") +
  geom_label(data = vec, aes(x = nmds1, y = nmds2, label = rownames(vec)), color = "black", size = 3, fill = "pink") +
  coord_fixed(ratio = 1) +
  labs(shape = "North/South", color = "Day of Year", caption = str_wrap("nMDS ordination of common phytoplankton presence/absence in Kachemak Bay from April through September, using Bray-Curtis dissimilarities. Points are colored by day of year, and shapes show north vs. south side of the bay. Vectors for seasonal and interannual changes (day of year and year as a continuous variable) are also shown, as are centroids for the 12 most common species/genera."))

By Year

ggplot() +
  geom_point(data = nmds.data, mapping = aes(x = nmds1, y = nmds2, color = Year.Continuous, shape = NS)) +
  scale_color_viridis_c() +
  geom_label_repel(data = common.spp.centroids, mapping = aes(x = nmds1, y = nmds2, label = Species), size = 3, fill = alpha(c("white"),0.7)) +
  geom_segment(data = vec, aes(x = 0, xend = nmds1, y = 0, yend = nmds2), color = "red3") +
  geom_label(data = vec, aes(x = nmds1, y = nmds2, label = rownames(vec)), color = "black", size = 3, fill = "pink") +
  coord_fixed(ratio = 1) +
  labs(shape = "North/South", color = "Year", caption = str_wrap("nMDS ordination of common phytoplankton presence/absence in Kachemak Bay from April through September, using Bray-Curtis dissimilarities. Points are colored by year, and shapes show north vs. south side of the bay. Vectors for seasonal and interannual changes (day of year and year as a continuous variable) are also shown, as are centroids for the 12 most common species/genera."))

Phyto Data Visualization and Exploration

2025-01-17

Species Abundance Heat Maps

By Month

By Year

nMDS Ordination Plots

By Day of Year

By Year