library(RODBC)
library(reshape2)
library(plyr)
library(lubridate)
library(dplyr)
library(pander)
library(ggplot2)
library(dplyr)
library(knitr)
library(tidyr)

setwd("\\\\psf/Home/Dropbox/MysticDB")
source("./Rcode/Sandbox/Jeff/wq-precip-scripts/load_wq.R")
wq <- load_wq()
source('./Rcode/Sandbox/Jeff/wq-precip-scripts/load_precip.R')
precip <- load_precip()
wq2 <- append_weather(wq,precip)

Different methods of smoothing

I experimented with different ways of smoothing historical bacteria data to look for trends and patterns. This is to supplement recent work summarizing boating standard exceedances.

First, I plotted each site’s bacterial data, as a noisy time series, on a log scale; the two reference lines are boating and swimming standards:

df_ECOLI <- wq2 %>% 
  select(Datetime, LocationID, CharacteristicID, ResultValue, Units, ProjectID, 
         WaterBodyID, WaterType, Latitude, Longitude, Weather) %>%
  filter(CharacteristicID %in% c("ECOLI"), 
         ProjectID %in% c("BASE", "BHWQM", "CSORWM"),
         WaterType == "Fresh", LocationID == "ALB006")

bac <- ggplot(df_ECOLI, aes(Datetime, log10(ResultValue))) 
bac <- bac + 
  geom_line(color = "gray72", size = .3) + theme_bw() + 
  geom_hline(yintercept = 2.37, color = "black", size = .5, show_guide = TRUE) +
  geom_hline(yintercept = 3.1, color = "black", size = .5, linetype = "longdash") +
  theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.major.x = element_line(linetype = "dotted"))
bac 

Then I added the default ggplot “loess” smoothing function (+stat_smooth()). It’s not transparent how it’s calculated. It’s a complex algorithm.

To experiment with more intuitive smoothing functions, I considered moving average and a moving geometric mean calculation.

First, moving average. If you plot the log of a moving average, you get strange results. Rare extremely high values pull the moving average up and keep it elevated. This is a moving average of the 24 time-adjacent samples.

#For filter function see:
# http://www.cookbook-r.com/Manipulating_data/Calculating_a_moving_average/

df_ECOLI2 <- group_by(df_ECOLI, LocationID) %>%
  arrange(Datetime) %>%
  mutate(mgm = stats::filter(ResultValue, rep(1/24, 24), sides =2))

bac <- ggplot(df_ECOLI2, aes(Datetime, log10(ResultValue))) 
bac <- bac + 
  stat_smooth() + 
  geom_line(color = "gray72", size = .3) + theme_bw() + 
  geom_line(data = df_ECOLI2, aes(Datetime, log10(mgm), color = WaterBodyID), size = 1) +
  geom_hline(yintercept = 2.37, color = "black", size = .5, show_guide = TRUE) +
  geom_hline(yintercept = 3.1, color = "black", size = .5, linetype = "longdash") +
  theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.major.x = element_line(linetype = "dotted"))
  bac 

But if you plot moving average of the log of ResultValue, you get a result that closely approximates the loess smoothing function. The average of log values is the calculation behind geometric mean. So this can be thought of as a moving geometric mean.

df_ECOLI2 <- group_by(df_ECOLI, LocationID) %>%
  arrange(Datetime) %>%
  mutate(logres = log10(ResultValue)) %>%
  mutate(mgm = stats::filter(logres, rep(1/24, 24), sides =2))

bac <- ggplot(df_ECOLI2, aes(Datetime, log10(ResultValue))) 
bac <- bac + 
  stat_smooth() + 
  geom_line(color = "gray72", size = .3) + theme_bw() + 
  #facet_wrap(~ LocationID, ncol = 5) +
  geom_line(data = df_ECOLI2, aes(Datetime, mgm, color = WaterBodyID), size = 1) +
  geom_hline(yintercept = 2.37, color = "black", size = .5, show_guide = TRUE) +
  geom_hline(yintercept = 3.1, color = "black", size = .5, linetype = "longdash") +
  theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.major.x = element_line(linetype = "dotted"))
  bac 

So I applied this to all sites, for ECOLI and ENT, and colored by WaterBody.

E. coli data 2000-2014

Log of bacteria value averaged over 24 time-adjacent samples, amounting to two year average in baseline data.

df_ECOLI <- wq2 %>% 
  select(Datetime, LocationID, CharacteristicID, ResultValue, Units, ProjectID, 
         WaterBodyID, WaterType, Latitude, Longitude, Weather) %>%
  filter(CharacteristicID %in% c("ECOLI"), 
         ProjectID %in% c("BASE", "BHWQM", "CSORWM"),
         WaterType == "Fresh")

df_ECOLI2 <- group_by(df_ECOLI, LocationID) %>%
  arrange(Datetime) %>%
  mutate(logres = log10(ResultValue)) %>%
  mutate(mgm = stats::filter(logres, rep(1/24, 24), sides =2))

bac <- ggplot(df_ECOLI2, aes(Datetime, log10(ResultValue))) 
bac <- bac + 
  stat_smooth() + 
  geom_line(color = "gray72", size = .3) + theme_bw() + 
  facet_wrap(~ LocationID, ncol = 4) +
  geom_line(data = df_ECOLI2, aes(Datetime, mgm, color = WaterBodyID), size = 1) +
  geom_hline(yintercept = 2.37, color = "black", size = .5, show_guide = TRUE) +
  geom_hline(yintercept = 3.1, color = "black", size = .5, linetype = "longdash") +
  theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.major.x = element_line(linetype = "dotted")) +
  labs(x = "Date", y = "log of *E. coli* MPN / 100 ml")
bac 

ENT data 1989-2014

Log of bacteria value averaged over 24 time-adjacent samples

df_ENT <- wq2 %>% 
  select(Datetime, LocationID, CharacteristicID, ResultValue, Units, ProjectID, 
         WaterBodyID, WaterType, Latitude, Longitude) %>%
  filter(CharacteristicID %in% c("ENT"), 
         ProjectID %in% c("BASE", "BHWQM", "CSORWM"),
         WaterType == "Fresh", LocationID != "MAR036", LocationID != "ALB006")

#create a column for log of ResultValue, and then create a column "mgm" that is the average of the 
# 24 time-adjacent values, a "moving geometric mean" 

df_ENT2 <- group_by(df_ENT, LocationID) %>%
  arrange(Datetime) %>%
  mutate(logres = log10(ResultValue)) %>%
  mutate(mgm = stats::filter(logres, rep(1/24, 24), sides =2))

# plot log of result values, in background, then moving geometric mean on top, along with 
# dplyr default smoothing function.  Add
# boating and swimming thresholds for reference

bac <- ggplot(df_ENT2, aes(Datetime, log10(ResultValue))) 
bac <- bac + stat_smooth() + geom_line(color = "gray82") + theme_bw() + 
  facet_wrap(~ LocationID, ncol = 4) +
  geom_line(data = df_ENT2, aes(Datetime, mgm, color = WaterBodyID), size = 1) +
  geom_hline(yintercept = 2.017, color = "black", size = .5, show_guide = TRUE) +
  geom_hline(yintercept = 2.544, color = "black", size = .5, linetype = "longdash") +
  theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_line(linetype = "dotted"),
        panel.grid.major.x = element_line()) +
  labs(x = "Date", y = "log of *Enterococcus* MPN / 100 ml")
bac

Conclusions

These patterns confirm several patterns we saw in the data we summarized recently in terms of boating exceedances in 5-year periods.