Load libraries

library(here)
library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(forecast)
library(nlme)
library(MuMIn)

bst <- read.csv(here("Week_13", "Data", "bacteria_size_temperature_edit.csv")) %>%
  filter(!is.na(LNA.ab.uml)) #remove NAs
bst$date <- dmy(bst$date) 

Exploratory Plots

Begin by exploring seasonal patterns in LNA abundance, cell size, biomass, and percent biomass, as well as temperature. I.e., make exploratory plots of these variables that focus on seasonality. What have you learned so far?

LNA Abundance and season

ggplot(bst, aes(x = factor(season), y = LNA.ab.uml)) +
  geom_boxplot()

LNA abundance is highest season 3 and lowest season 1.

LNA cell size and season

ggplot(bst, aes(x = factor(season), y = LNA.bv)) +
  geom_boxplot()

LNA cell size is highest season 2 even though its abundance is 2nd to last.

LNA total biomass and season

ggplot(bst, aes(x = factor(season), y = LNA.B)) +
  geom_boxplot()

LNA total biomass and abundance is highest season 3.

Percent of total bacterial biomass and season

ggplot(bst, aes(x = factor(season), y = X.LNA.BB)) +
  geom_boxplot()

Temperature at 5m and season

ggplot(bst, aes(x = factor(season), y = temp.5.m.E2)) +
  geom_boxplot()

During the season 3 where LNA abundance is highest, the temperature at 5m depth is also the highest. Season 1 has the lowest abundance and lowest temperature.

Statistical models

  1. Let’s create statistical models to test for long-term trends, as well as relationships with temperature. Because the sampling is monthly, with only a couple gaps, we can treat the time series as one measured at discrete intervals. To keep track of the sample order you’ll need to construct a new column which numbers each sample in the order they were sampled. I.e., the column should look like 1, 2, 3, 4, etc., where the numbers correspond to the month in the time series. You can use the year and month columns to construct this. Use linear models to test for long term (linear) trends in the five variables: LNA abundance, LNA cell size, LNA biomass, LNA percent biomass, and temperature. For each of these models, assess whether the residuals show evidence of temporal autocorrelation. If autocorrelation is present, add a residual autocorrelation component to your model. Assess whether the model successfully accounted for autocorrelation. What have you learned from these models? Does accounting for autocorrelation change the results?
bst <- bst %>%
  arrange(year, month) %>%
  mutate(sample_number = row_number())

Linear Models

modlnaab = lm(LNA.ab.uml~ 1, data = bst)
modlnbv = lm(LNA.bv ~ 1, bst)
modlnab = lm(LNA.B ~ 1, bst)
modlnabb = lm(X.LNA.BB ~ 1, bst)
modtemp = lm(temp.5.m.E2 ~ 1, bst)

Residual ACF

modlnaab = lm(LNA.ab.uml~ 1, data = bst)

bst$resid.rainpredictor.raw = resid(modlnaab) 
ggAcf(bst$resid.rainpredictor.raw) + labs(title = 'raw residuals, no predictor and no autocorrelation for time')

modlnbv = lm(LNA.bv ~ 1, bst)

bst$resid.rainpredictor.raw = resid(modlnbv) 
ggAcf(bst$resid.rainpredictor.raw) + labs(title = 'raw residuals, no predictor and no autocorrelation for time')

modlnab = lm(LNA.B ~ 1, bst)

bst$resid.rainpredictor.raw = resid(modlnab) 
ggAcf(bst$resid.rainpredictor.raw) + labs(title = 'raw residuals, no predictor and no autocorrelation for time')

modlnabb = lm(X.LNA.BB ~ 1, bst)

bst$resid.rainpredictor.raw = resid(modlnabb) 
ggAcf(bst$resid.rainpredictor.raw) + labs(title = 'raw residuals, no predictor and no autocorrelation for time')

modtemp = lm(temp.5.m.E2 ~ 1, bst)

bst$resid.rainpredictor.raw = resid(modtemp) 
ggAcf(bst$resid.rainpredictor.raw) + labs(title = 'raw residuals, no predictor and no autocorrelation for time')

Statistical Models

modlnaabgls =  gls(LNA.ab.uml ~ 1, data = bst, correlation = corAR1(form =~ sample_number), method = "ML") 
modlnbvgls = gls(LNA.bv ~ 1, data = bst, correlation = corAR1(form =~ sample_number), method = "ML") 
modlnabgls = gls(LNA.B ~ 1, data = bst, correlation = corAR1(form =~ sample_number), method = "ML") 
modlnabbgls = gls(X.LNA.BB ~ 1, data = bst, correlation = corAR1(form =~ sample_number), method = "ML") 
modtempgls = gls(temp.5.m.E2 ~ 1, data = bst, correlation = corAR1(form =~ sample_number), method = "ML") 

Residual ACF

modlnaabgls

bst$resid.rainpredictor.raw = resid(modlnaabgls) 
ggAcf(bst$resid.rainpredictor.raw)

modlnbvgls

bst$resid.rainpredictor.raw = resid(modlnbvgls) 
ggAcf(bst$resid.rainpredictor.raw) 

modlnabgls

bst$resid.rainpredictor.raw = resid(modlnabgls) 
ggAcf(bst$resid.rainpredictor.raw) 

modlnabbgls

bst$resid.rainpredictor.raw = resid(modlnabbgls) 
ggAcf(bst$resid.rainpredictor.raw) 

modtempgls

bst$resid.rainpredictor.raw = resid(modtempgls) 
ggAcf(bst$resid.rainpredictor.raw) 

AICc

  1. Perform a similar set of analyses using temperature as the predictor, to test whether temperature can explain variation in the four LNA metrics, and whether autocorrelation needs to be accounted for when testing these relationships. Make appropriate plots of the results and discuss the magnitude of the relationships. Considering the whole set of analyses, what are your interpretations of the dynamics and drivers of LNA bacteria in this system?

modlnaab

AICc(modlnaab)
## [1] 3132.848
AICc(modlnaabgls)
## [1] 3121.856

modlnbv

AICc(modlnbv)
## [1] -822.4318
AICc(modlnbvgls)
## [1] -836.202

modlnab

AICc(modlnab)
## [1] 595.652
AICc(modlnabgls)
## [1] 586.03

modlnabb

AICc(modlnabb)
## [1] -177.2299
AICc(modlnabbgls)
## [1] -192.6451

modtemp

AICc(modtemp)
## [1] 586.0755
AICc(modtempgls)
## [1] 494.8632