library(RODBC)
library(reshape2)
library(plyr)
library(lubridate)
library(dplyr)
library(pander)
library(ggplot2)
library(dplyr)
library(knitr)
library(tidyr)
setwd("\\\\psf/Home/Dropbox/MysticDB")
source("./Rcode/Sandbox/Jeff/wq-precip-scripts/load_wq.R")
wq <- load_wq()
source('./Rcode/Sandbox/Jeff/wq-precip-scripts/load_precip.R')
precip <- load_precip()
wq2 <- append_weather(wq,precip)
I experimented with different ways of smoothing historical bacteria data to look for trends and patterns. This is to supplement recent work summarizing boating standard exceedances.
First, I plotted each site’s bacterial data, as a noisy time series, on a log scale; the two reference lines are boating and swimming standards:
df_ECOLI <- wq2 %>%
select(Datetime, LocationID, CharacteristicID, ResultValue, Units, ProjectID,
WaterBodyID, WaterType, Latitude, Longitude, Weather) %>%
filter(CharacteristicID %in% c("ECOLI"),
ProjectID %in% c("BASE", "BHWQM", "CSORWM"),
WaterType == "Fresh", LocationID == "ALB006")
bac <- ggplot(df_ECOLI, aes(Datetime, log10(ResultValue)))
bac <- bac +
geom_line(color = "gray72", size = .3) + theme_bw() +
geom_hline(yintercept = 2.37, color = "black", size = .5, show_guide = TRUE) +
geom_hline(yintercept = 3.1, color = "black", size = .5, linetype = "longdash") +
theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dotted"))
bac
Then I added the default ggplot “loess” smoothing function (+stat_smooth()). It’s not transparent how it’s calculated. It’s a complex algorithm.
To experiment with more intuitive smoothing functions, I considered moving average and a moving geometric mean calculation.
First, moving average. If you plot the log of a moving average, you get strange results. Rare extremely high values pull the moving average up and keep it elevated. This is a moving average of the 24 time-adjacent samples.
#For filter function see:
# http://www.cookbook-r.com/Manipulating_data/Calculating_a_moving_average/
df_ECOLI2 <- group_by(df_ECOLI, LocationID) %>%
arrange(Datetime) %>%
mutate(mgm = stats::filter(ResultValue, rep(1/24, 24), sides =2))
bac <- ggplot(df_ECOLI2, aes(Datetime, log10(ResultValue)))
bac <- bac +
stat_smooth() +
geom_line(color = "gray72", size = .3) + theme_bw() +
geom_line(data = df_ECOLI2, aes(Datetime, log10(mgm), color = WaterBodyID), size = 1) +
geom_hline(yintercept = 2.37, color = "black", size = .5, show_guide = TRUE) +
geom_hline(yintercept = 3.1, color = "black", size = .5, linetype = "longdash") +
theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dotted"))
bac
But if you plot moving average of the log of ResultValue, you get a result that closely approximates the loess smoothing function. The average of log values is the calculation behind geometric mean. So this can be thought of as a moving geometric mean.
df_ECOLI2 <- group_by(df_ECOLI, LocationID) %>%
arrange(Datetime) %>%
mutate(logres = log10(ResultValue)) %>%
mutate(mgm = stats::filter(logres, rep(1/24, 24), sides =2))
bac <- ggplot(df_ECOLI2, aes(Datetime, log10(ResultValue)))
bac <- bac +
stat_smooth() +
geom_line(color = "gray72", size = .3) + theme_bw() +
#facet_wrap(~ LocationID, ncol = 5) +
geom_line(data = df_ECOLI2, aes(Datetime, mgm, color = WaterBodyID), size = 1) +
geom_hline(yintercept = 2.37, color = "black", size = .5, show_guide = TRUE) +
geom_hline(yintercept = 3.1, color = "black", size = .5, linetype = "longdash") +
theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dotted"))
bac
So I applied this to all sites, for ECOLI and ENT, and colored by WaterBody.
Log of bacteria value averaged over 24 time-adjacent samples, amounting to two year average in baseline data.
df_ECOLI <- wq2 %>%
select(Datetime, LocationID, CharacteristicID, ResultValue, Units, ProjectID,
WaterBodyID, WaterType, Latitude, Longitude, Weather) %>%
filter(CharacteristicID %in% c("ECOLI"),
ProjectID %in% c("BASE", "BHWQM", "CSORWM"),
WaterType == "Fresh")
df_ECOLI2 <- group_by(df_ECOLI, LocationID) %>%
arrange(Datetime) %>%
mutate(logres = log10(ResultValue)) %>%
mutate(mgm = stats::filter(logres, rep(1/24, 24), sides =2))
bac <- ggplot(df_ECOLI2, aes(Datetime, log10(ResultValue)))
bac <- bac +
stat_smooth() +
geom_line(color = "gray72", size = .3) + theme_bw() +
facet_wrap(~ LocationID, ncol = 4) +
geom_line(data = df_ECOLI2, aes(Datetime, mgm, color = WaterBodyID), size = 1) +
geom_hline(yintercept = 2.37, color = "black", size = .5, show_guide = TRUE) +
geom_hline(yintercept = 3.1, color = "black", size = .5, linetype = "longdash") +
theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dotted")) +
labs(x = "Date", y = "log of *E. coli* MPN / 100 ml")
bac
Log of bacteria value averaged over 24 time-adjacent samples
df_ENT <- wq2 %>%
select(Datetime, LocationID, CharacteristicID, ResultValue, Units, ProjectID,
WaterBodyID, WaterType, Latitude, Longitude) %>%
filter(CharacteristicID %in% c("ENT"),
ProjectID %in% c("BASE", "BHWQM", "CSORWM"),
WaterType == "Fresh", LocationID != "MAR036", LocationID != "ALB006")
#create a column for log of ResultValue, and then create a column "mgm" that is the average of the
# 24 time-adjacent values, a "moving geometric mean"
df_ENT2 <- group_by(df_ENT, LocationID) %>%
arrange(Datetime) %>%
mutate(logres = log10(ResultValue)) %>%
mutate(mgm = stats::filter(logres, rep(1/24, 24), sides =2))
# plot log of result values, in background, then moving geometric mean on top, along with
# dplyr default smoothing function. Add
# boating and swimming thresholds for reference
bac <- ggplot(df_ENT2, aes(Datetime, log10(ResultValue)))
bac <- bac + stat_smooth() + geom_line(color = "gray82") + theme_bw() +
facet_wrap(~ LocationID, ncol = 4) +
geom_line(data = df_ENT2, aes(Datetime, mgm, color = WaterBodyID), size = 1) +
geom_hline(yintercept = 2.017, color = "black", size = .5, show_guide = TRUE) +
geom_hline(yintercept = 2.544, color = "black", size = .5, linetype = "longdash") +
theme(plot.background = element_rect(color = NA), panel.grid.minor = element_blank(),
panel.grid.major.y = element_line(linetype = "dotted"),
panel.grid.major.x = element_line()) +
labs(x = "Date", y = "log of *Enterococcus* MPN / 100 ml")
bac
These patterns confirm several patterns we saw in the data we summarized recently in terms of boating exceedances in 5-year periods.