Document last updated 2020-12-14 19:16:07 by Benjamin Meyer (ben@kenaiwatershed.org)
This draft document contains preliminary data explorations of 2019-2020 beach bacteria sampling data from the Kenai River Watershed.
Notes:
Attempted to source beach sampling data from EPA water quality portal but did not find it there. Sourced data instead from AWQMS files and other files found on KWF server.
2020 MST data is present in AWQMS files, 2019 MST data is not. Sourced 2019 MST data from “MST results 2019.xlsx”
In the future, request that SGS can provide sample results in .csv or .xlsx format in addition to the PDFs to eliminate potential for data entry or data management error.
# import known data from local server source
# specify column import names
vars <- as.character(c("Monitoring Location Name",
"Activity Latitude",
"Activity Longitude",
"Activity Medium",
"Activity Start Date",
"Activity Start Time",
"Characteristic Name",
"Result Analytical Method",
"Result Value",
"Result Value Units",
"Result Qualifier"))
# specify new column names
new_vars <- as.character(c("location",
"lat",
"long",
"medium",
"date",
"time",
"data_type",
"analytical_method",
"val",
"units",
"qualifier"))
# import 2019 AWQMS data
## used this excel file because it had the most recent modified date
## see notes in excel file; not sure if all available data is contained within here
bs19 <- read_excel("data/KWF_2019_Kenai-BEACH_AWQMS_forQAQC_MH101119.xlsx", sheet = "AWQMS Data Template", skip = 7) %>%
select(all_of(vars))
# import 2020 AWQMS data
## see notes in excel file; not sure if all avilable data is contained within here
bs20 <- read_excel("data/2020_KWF_Kenai-BEACH_AWQMS_SUBMITTED.xlsx", sheet = "RAW AWQMS Data Template", skip = 3) %>%
select(all_of(vars))
# combine both years
dat <- bind_rows(bs19,bs20)
# rename columns
colnames(dat) <- new_vars
## 2019 MST data absent from AWQMS file. So, import separately and format to join
## imported sheet was created manually on 12/8/20 from the sheet titled "Summary"
mst19 <- read_excel("data/MST results 2019.xlsx", sheet = "AWQMS_summary_format") %>%
select(-site_id)
# join 2019 MST data to overall data set
dat <- bind_rows(dat,mst19)
# tidy column formats
dat <- dat %>%
transform(time = hms::as_hms(time))
# replace "Gull_Gull-#" with "Gull_Gull-4"
# correct location names
dat <- dat %>%
mutate(analytical_method = str_replace_all(analytical_method,c("Gull_Gull-5" = "Gull_Gull-4",
"Gull_Gull-6" = "Gull_Gull-4"))) %>%
mutate(location = str_replace_all(location, c("North Kenai River Beach 4" = "North Kenai Beach 4")))
rm(bs19,bs20,mst19)
Notes post data read-in
Procedure:
# recreate table in tab "2019-2020_FinalReport_MST2020"
mst_tbl <- dat %>%
filter(data_type == "Microbial Source Tracking") %>%
arrange(location,date,analytical_method) %>%
select(date,location,analytical_method,val) %>%
pivot_wider(names_from = analytical_method, values_from = val) %>%
arrange(date,location)
colnames(mst_tbl) <- c("Date","Location","Dog Feces","Gull Feces","Human Feces")
# print
datatable(mst_tbl,
filter = 'top',
options = list(pageLength = 5, autoWidth = TRUE))
# prep table
mst_tbl <- dat %>%
filter(data_type == "Microbial Source Tracking") %>%
arrange(location,date,analytical_method) %>%
select(date,location,analytical_method,val) %>%
# replace results DNQ and ND with zero and convert to numeric
mutate(val = as.numeric(str_replace_all(val,c("DNQ" = "0", "ND" = "0"))))
# plot
mst_tbl %>%
ggplot(aes(as.factor(date),val, fill = analytical_method)) +
geom_col(position = "dodge") +
facet_grid(.~location, labeller = labeller(location = label_wrap_gen(5))) +
xlab("") +
ylab("Concentration (Copies/100 mL)") +
theme_bw() +
# modify legend
scale_fill_discrete(name="Source",
breaks=c("Dog_BacCan-UCD", "Gull_Gull-4", "Human_HF183"),
labels=c("Dog", "Gull", "Human")) +
# theme
theme(axis.text.x = element_text(vjust = .5, angle = 85),
strip.text.y = element_text(angle = 0)) +
col_theme
# save plot
ggsave("output/mst_kenai_beach_2019_2020.png", width = 12, height = 6)
# prep table
bac_tbl <- dat %>%
filter(data_type %in% c("Enterococci","Fecal Coliform")) %>%
arrange(location,date,data_type) %>%
select(date,location,data_type,val) %>%
# replace results DNQ and ND with zero and convert to numeric
mutate(val = as.numeric(str_replace_all(val,c("DNQ" = "0", "ND" = "0")))) %>%
# use day of year for x axis and year for facet grid
mutate(day = yday(date),
year = year(date))
# print
datatable(bac_tbl,
filter = 'top',
options = list(pageLength = 5, autoWidth = TRUE))
# plot 1
bac_tbl %>%
ggplot(aes(x = as.Date(paste(2014,strftime(date,format="%m-%d"),sep="-")),
y = val,
color = as.factor(year))) +
geom_jitter(size = 3) +
facet_grid(location~data_type, labeller = labeller(location = label_wrap_gen(5))) +
xlab("") +
ylab("Individual Sample Concentration (CFU/100 mL)") +
scale_color_discrete(name = "Year")+
# theme
theme_bw() +
theme(axis.text.x = element_text(vjust = 1),
strip.text.y = element_text(angle = 0)) +
points_theme
# save plot
ggsave("output/bacteria_conc_2019_2020_v1.png", width = 10, height = 10)
# plot 2, enterococci
bac_tbl %>%
filter(data_type == "Enterococci") %>%
# note; use of year for date format is "fake" placeholder
ggplot(aes(x = as.Date(paste(2014,strftime(date,format="%m-%d"),sep="-")),
y = val,
color = as.factor(year))) +
geom_jitter(size = 2) +
facet_grid(. ~ location, scales = "free_y", labeller = labeller(location = label_wrap_gen(5))) +
xlab("") +
ylab("Individual Sample\nConcentration (CFU/100 mL)") +
ylim(0,625) +
ylim(0,625) +
scale_color_discrete(name = "Year") +
# single-sample threshold
geom_hline(aes(yintercept = 130, linetype = "Raw seafood consumption\n(130 CFU/100 mL)"), color = "blue") +
scale_linetype_manual(name = "Threshold", values = c(2, 2),
guide = guide_legend(override.aes = list(color = c("blue")))) +
# overall theme
theme_bw() +
theme(axis.text.x = element_text(vjust = 1,hjust = 1, angle = 35),
strip.text.y = element_text(angle = 0)) +
ggtitle("Enterococci Concentrations") +
points_theme
# save plot
ggsave("output/enterococci_conc_2019_2020.png", width = 10, height = 4)
# plot 3, fecal coliform
bac_tbl %>%
filter(data_type == "Fecal Coliform") %>%
# note; use of year for date format is "fake" placeholder
ggplot(aes(x = as.Date(paste(2014,strftime(date,format="%m-%d"),sep="-")),
y = val,
color = as.factor(year))) +
geom_jitter(size = 2) +
facet_grid(. ~ location, scales = "free_y", labeller = labeller(location = label_wrap_gen(5))) +
xlab("") +
ylab("Individual Sample\nConcentration (CFU/100 mL)") +
scale_color_discrete(name = "Year") +
# single-sample thresholds
geom_hline(aes(yintercept = 31, linetype = "Raw Seafood Consumption\n(31 CFU/100 mL)"), color = "blue") +
geom_hline(aes(yintercept = 400, linetype = "Water recreation, secondary\n(400 CFU/100 mL)"), color = "red") +
scale_linetype_manual(name = "Threshold", values = c(2, 2),
guide = guide_legend(override.aes = list(color = c("blue","red")))) +
# overall theme
theme_bw() +
theme(axis.text.x = element_text(vjust = 1,hjust = 1, angle = 35),
strip.text.y = element_text(angle = 0)) +
points_theme +
ggtitle("Fecal Coliform Concentrations")
# save plot
ggsave("output/fecal_coliform_conc_2019_2020.png", width = 10, height = 4)
What percentage of individual sample values exceed thresholds for each site?
What percentage of 30-day geometric mean values exceed thresholds for each site?
General Notes
Threshold values
Enterococci:
Fecal coliform:
“Harvest for consumption of raw mollusks or other raw seafood: The geometric mean of samples may not exceed 14 fecal coliform/100 ml; and not more than 10% of the samples may exceed 31 CFU/100 mL for a membrane filtration test.”
“Water recreation, secondary: In a 30-day period, the geometric mean of samples may not exceed 200 fecal coliform/100 mL, and not more than 10% of the samples may exceed 400 fecal coliform/100 mL.”
Calculate 30-day rolling geometric mean values for Enterococci and Fecal Coliform
library(zoo)
library(psych)
dat_gm30 <- dat %>%
filter(data_type %in% c("Fecal Coliform","Enterococci")) %>%
mutate(year = year(date)) %>%
transform(val = as.numeric(val)) %>%
# use 50% of value if it has an "Undetected" qualifier
mutate(val = ifelse(is.na(qualifier),val,
ifelse(qualifier == "U",(val*.5),val))) %>%
group_by(data_type,location,year) %>%
arrange(year,data_type,location,date) %>%
# calculate 30 day rolling geometric means
mutate(gm30 = rollapplyr(val, 1:n() - findInterval(date - days(30), date),
geometric.mean, fill = NA, partial = T)) %>%
# mutate values from the first annual 30 days of sampling to NA. (We need 30 days for a 30-day rolling geometric mean value). However, the sampling dates do not always line up so that the first 4 weeks of observations is exactly 30 days. Here, the rolling window is set to 25 days so as not to "throw away" that data. What we really want is a geometric mean of the first four weekly observations
mutate(gm30 = as.numeric(ifelse(date < min(date + days(25)),"",gm30)))
Designate exceedence values
# create columns of exceedence occurences
dat_excd <- dat_gm30 %>%
# individual sample exceedences
mutate(indv_excd = case_when(
(val < 130 & data_type == "Enterococci") ~ "no_indv_excd",
(val >= 130 & data_type == "Enterococci") ~ "ec_excd_130",
(val < 31 & data_type == "Fecal Coliform") ~ "no_indv_excd",
(val >= 31 & val < 400 & data_type == "Fecal Coliform") ~ "fc_excd_31",
(val >= 400 & data_type == "Fecal Coliform") ~ "fc_excd_400")) %>%
# 30-day rolling geometric mean value exceedences
mutate(gm30_excd = case_when(
(gm30 < 35 & data_type == "Enterococci") ~ "no_gm_excd",
(gm30 >= 35 & data_type == "Enterococci") ~ "gm_ec_excd_35",
(gm30 < 14 & data_type == "Fecal Coliform") ~ "no_gm_excd",
(gm30 >= 14 & val < 200 & data_type == "Fecal Coliform") ~ "gm_fc_excd_14",
(gm30 >= 200 & data_type == "Fecal Coliform") ~ "gm_fc_excd_200"))
Calculate % exceedence frequency by site, year, and bacteria type for individual samples
library(janitor)
# prop table for individual exceedences
indv_excd_tbl <- dat_excd %>%
# remove observations with no value = NA
filter(!is.na(val)) %>%
select(location,data_type,year,indv_excd) %>%
group_by(location,data_type,year) %>%
count(indv_excd) %>%
spread(key = indv_excd, val = n) %>%
mutate_at(vars(contains("excd")),funs(as.numeric)) %>%
mutate(across(everything(), ~replace_na(.x, 0))) %>%
# sum across columns
# was unable to find a way to successfully use a function to sum across columns here; used manual addition...
mutate(sample_n = ec_excd_130 + fc_excd_31 + fc_excd_400 + no_indv_excd) %>%
# calculate % of sample ct exceedence
mutate(ec_excd_130 = ec_excd_130 / sample_n,
fc_excd_31 = fc_excd_31 / sample_n,
fc_excd_400 = fc_excd_400 / sample_n) %>%
select(-no_indv_excd)
indv_excd_tbl_exp <- indv_excd_tbl %>%
select(location, data_type,year,sample_n,ec_excd_130,fc_excd_31,fc_excd_400) %>%
transform(year = as.factor(year),
sample_n = as.factor(sample_n)) %>%
adorn_pct_formatting()
# prep table for export
colnames(indv_excd_tbl_exp) <- c("Location","Bacteria","Year","Sample Count",
"% Enterococci Samples Exceeding 130 CFU/100 mL",
"% Fecal Coliform Samples Exceeding 31 CFU/100 mL",
"% Fecal Coliform Samples Exceeding 400 CFU/100 mL")
# save table in csv
write_csv(indv_excd_tbl_exp,"output/individual_sample_exceedences.csv")
# print table
datatable(indv_excd_tbl_exp,
filter = 'top',
options = list(pageLength = 5, autoWidth = TRUE))
Recall; regarding individual sample values and 30-day periods:
Enterococci:
Fecal coliform:
“… not more than 10% of the samples may exceed 31 CFU/100 mL for a membrane filtration test.”
“… not more than 10% of the samples may exceed 400 fecal coliform/100 mL.”
# calculate % of individual samples exceeding thresholds within 30-day periods
indv_excd_30day <- dat_excd %>%
# designate 30-day periods for each site/year
group_by(location,data_type,year) %>%
mutate(period_30day = rep(seq_len(n()/2), each = 4, length.out = n()),
sample_ct = n()) %>%
ungroup() %>%
# create columns of start/end dates for each 30-day period
group_by(location,data_type,year,period_30day) %>%
mutate(start_date = min(date),
end_date = max(date)) %>%
# count number of exceedences by type, site, year
group_by(location,data_type,year,sample_ct,period_30day,start_date,end_date) %>%
filter(!is.na(indv_excd)) %>%
count(indv_excd) %>%
# what % of individual samples exceeded thresholds during 30-day periods?
mutate(pct_indv_samples_excd = (n / sample_ct)) %>%
# remove instances of no individual sample exceedences
filter(indv_excd != "no_indv_excd")
# prep table for export
indv_excd_30day_exp <- indv_excd_30day %>%
# retain only instances where > 10% of individual samples exceeded thresholds
filter(pct_indv_samples_excd > .1) %>%
ungroup() %>%
dplyr::mutate_all(as.character) %>%
mutate(pct_indv_samples_excd = as.numeric(pct_indv_samples_excd)) %>%
# adorn with percentage format
adorn_pct_formatting() %>%
select(location,data_type,year,indv_excd,start_date,end_date,pct_indv_samples_excd) %>%
# replace exceedence type with longer names
mutate(indv_excd = str_replace_all(indv_excd,c("fc_excd_31" = "Fecal Coliform > 31",
"fc_excd_400" = "Fecal Coliform > 400",
"ec_excd_130" = "Enterococci > 130"))) %>%
spread(indv_excd,pct_indv_samples_excd) %>%
arrange(year,location,data_type)
# rename columns
colnames(indv_excd_30day_exp) <- c("Location","Bacteria","Year","Start Date","End Date",
"% Samples with Enterococci > 130 in 30-day period",
"% Samples with Fecal Coliform > 31 in 30-day period")
# save table in csv
write.csv(indv_excd_30day_exp,"output/30day_periods_individual_sample_exceedences.csv", row.names = F)
# print table
datatable(indv_excd_30day_exp,
filter = 'top',
options = list(pageLength = 5, autoWidth = TRUE))
indv_excd_tbl %>%
gather(excd, val, contains("excd")) %>%
ggplot(aes(as.factor(year),val,fill = excd)) +
geom_col(position = "dodge2") +
facet_grid(. ~ location, labeller = labeller(location = label_wrap_gen(5))) +
ylim(0,1) +
xlab("") +
ylab("% of Total Indvidual Samples\nExceeding Thresholds") +
theme_bw() +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
# modify legend text
scale_fill_discrete(name="Exceedence Type\n(CFU/100 mL)",
breaks = c("ec_excd_130","fc_excd_31","fc_excd_400"),
labels = c("Enterococci > 130",
"Fecal Coliform > 31",
"Fecal Coliform > 400")) +
col_theme
# save image
ggsave("output/individual_sample_exceedences.png",width = 10, height = 4)
indv_excd_30day %>%
ggplot(aes(period_30day,pct_indv_samples_excd,fill = indv_excd)) +
geom_col(position = position_dodge(preserve = "single")) +
facet_grid(year ~ location, labeller = labeller(location = label_wrap_gen(5))) +
ylim(0,.3) +
xlab("30-Day Sampling Period") +
ylab("% of Indvidual Samples\nExceeding Thresholds Within 30-Day Periods") +
theme_bw() +
geom_hline(yintercept = .1, linetype = "dashed") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
# modify legend text
scale_fill_discrete(name="Exceedence Type\n(CFU/100 mL)",
breaks = c("ec_excd_130","fc_excd_31","fc_excd_400"),
labels = c("Enterococci > 130",
"Fecal Coliform > 31",
"Fecal Coliform > 400")) +
col_theme
# save image
ggsave("output/30day_individual_sample_exceedences.png",width = 10, height = 4)
# plot, enterococci 30 day geometric means
dat_excd %>%
filter(data_type == "Enterococci") %>%
# note; use of year for date format is "fake" placeholder
ggplot(aes(x = as.Date(paste(2014,strftime(date,format="%m-%d"),sep="-")),
y = gm30,
color = as.factor(year))) +
geom_jitter(size = 2) +
facet_grid(. ~ location, scales = "free_y", labeller = labeller(location = label_wrap_gen(5))) +
xlab("") +
ylab("30-Day Geometric Mean Sample\nConcentration (CFU/100 mL)") +
ylim(0,250) +
scale_color_discrete(name = "Year") +
# 30 day geometric mean threshold
geom_hline(aes(yintercept = 35, linetype = "Raw seafood consumption\n(35 CFU/100 mL)"), color = "blue") +
scale_linetype_manual(name = "Threshold", values = c(2, 2),
guide = guide_legend(override.aes = list(color = c("blue")))) +
# overall theme
theme_bw() +
theme(axis.text.x = element_text(vjust = 1,hjust = 1, angle = 35),
strip.text.y = element_text(angle = 0)) +
ggtitle("Enterococci\n30-Day Geometric Mean Concentrations") +
points_theme
# save plot
ggsave("output/geomean_enterococci_conc_2019_2020.png", width = 10.5, height = 4)
# plot, fecal coliform 30 day geometric means
dat_excd %>%
filter(data_type == "Fecal Coliform") %>%
# note; use of year for date format is "fake" placeholder
ggplot(aes(x = as.Date(paste(2014,strftime(date,format="%m-%d"),sep="-")),
y = gm30,
color = as.factor(year))) +
geom_jitter(size = 2) +
facet_grid(. ~ location, scales = "free_y", labeller = labeller(location = label_wrap_gen(5))) +
xlab("") +
ylab("30-Day Geometric Mean Sample\nConcentration (CFU/100 mL)") +
scale_color_discrete(name = "Year") +
# 30 day geometric mean thresholds
geom_hline(aes(yintercept = 14, linetype = "Raw Seafood Consumption\n(14 CFU/100 mL)"), color = "blue") +
geom_hline(aes(yintercept = 200, linetype = "Water recreation, secondary\n(200 CFU/100 mL)"), color = "red") +
scale_linetype_manual(name = "Threshold", values = c(2, 2),
guide = guide_legend(override.aes = list(color = c("blue","red")))) +
# overall theme
theme_bw() +
theme(axis.text.x = element_text(vjust = 1,hjust = 1, angle = 35),
strip.text.y = element_text(angle = 0)) +
points_theme +
ggtitle("Fecal Coliform\n30-Day Geometric Mean of Concentrations")
# save plot
ggsave("output/geomean_fecal_coliform_conc_2019_2020.png", width = 10, height = 4)
Calculate % exceedence frequency by site, year, and bacteria type for 30-day rolling geometric mean
need to make text larger on all figures plot of 24 hr sampling event in 2019??
Is there a relationship between fecal coliform and enterococci concentrations?