# Selects a site of interest and a reference site for comparison
# Here South Skunk upstream "s10850003" and downstream "s10850002"
# Alternately, use "s10500001" for lower Indian Creek
site_a <- "10850003"
site_ref <- "10850002"
sites_comp <- c(site_a, site_ref)

# The "DNR" placeholders will stand in for any csv file downloaded from DNR's AQuIA
# Within a session, the DNR object can be used in other scripts.
DNR_input <- "data/siteSamplingData-10850002.csv"
DNR_output <- "data/output.csv"
# Document the input when you run variations: 
# downloaded: 2020-07-23
# sites: South Skunk ambient site 10850002
# analytes: all
# dates: all (most recent 2020-06-02)

library(tidyverse)
library(lubridate)
# import from csv.  Some of the error flags have letter codes
# They are rare enough that R can't guess type from the first 1000 rows, so we have to specify
DNR_raw <- read_csv(DNR_input, 
                col_types = cols(qualFlag = col_character(), siteID = col_character(),
                                 quantFlag = col_character(), quantLimit = col_double(), 
                                 remark = col_character(), 
                                 sampleDate = col_datetime(format = "%m/%d/%Y %H:%M:%S"))) %>%
  
  # Combine data from the upstream site
# downloaded: 2020-11-06
# sites: South Skunk ambient site 10850003
# analytes: all
# dates: all (monitored thru 2014)
  add_row(read_csv("data/siteSamplingData-10850003.csv", 
                col_types = cols(qualFlag = col_character(), siteID = col_character(),
                                 quantFlag = col_character(), quantLimit = col_double(), 
                                 remark = col_character(), 
                                 sampleDate = col_datetime(format = "%Y-%m-%dT%H:%M:%S%z"))))
# Corrects spelling errors and variations resulting in duplicate analytes
DNR_raw$analyte[DNR_raw$analyte=="Ammonia-nitrogen (AS N)"] <- "Ammonia (as N)"
DNR_raw$analyte[DNR_raw$analyte=="Ammonia-nitrogen (as N)"] <- "Ammonia (as N)"
DNR_raw$analyte[DNR_raw$analyte=="Ammonia (AS N)"] <- "Ammonia (as N)"
DNR_raw$analyte[DNR_raw$analyte=="Ammonia"] <- "Ammonia (as NH3)"
DNR_raw$analyte[DNR_raw$analyte=="Inorganic nitrogen (nitrate and nitrite) (AS N)"] <- "Inorganic nitrogen (nitrate and nitrite) (as N)"
DNR_raw$analyte[DNR_raw$analyte=="Orthophosphate (AS P)"] <- "Orthophosphate (as P)"
DNR_raw$analyte[DNR_raw$analyte=="Phosphate-phosphorus (AS P)"] <- "Phosphate-phosphorus (as P)"
DNR_raw$analyte[DNR_raw$analyte=="Kjeldahl nitrogen (AS N)"] <- "Kjeldahl nitrogen (as N)"
# Optionally, changes nitrate to inorganic nitrogen, since nitrite was negligible during that period
DNR_raw$analyte[DNR_raw$analyte=="Nitrate (as N)"] <- "Inorganic nitrogen (nitrate and nitrite) (as N)"


# Counts number of detects and nondetects, by analyte.
# Revised 2020-08-06. Sum works better than pivot.
DNR_analytes <- DNR_raw %>%
  group_by(analyte) %>%
  summarise(
    count_detected = sum(detect, na.rm = TRUE),
    count_samples = n()
  ) %>%
  arrange(desc(count_detected), desc(count_samples))
DNR_analytes
selected_analytes <- DNR_analytes %>%
  filter(count_detected > 60)
DNR <- DNR_raw %>%
  filter(analyte %in% selected_analytes$analyte)
  # Optional - Break apart sample date
DNR <- DNR %>% mutate(Date = as_date(sampleDate),
         Year = year(Date),
         Month = month(Date, label = TRUE),
         Week = week(Date),
         Day = wday(Date, label = TRUE))
# opt. add a letter code to site ID so it can be more easily pivoted
#  mutate(siteID = str_c("s", siteID))

# makes a list of matching periods, that can then be used to filter results
MatchingDates<- DNR %>%
  group_by(analyte, Date) %>%
  filter(site_a %in% siteID & site_ref %in% siteID) %>%
  ungroup() %>%
  select(Date) %>%
  distinct()

MatchingMonth<- DNR %>%
  group_by(analyte, Year, Month, Date) %>%
  filter(site_a %in% siteID & site_ref %in% siteID) %>%
  ungroup() %>%
  select(Date) %>%
  distinct()

Regular sampling at the South Skunk River upstream of Ames at Sleepy Hollow Access (US1) occurred from October 2000 thru Sept 2014, with a few small gaps. Sampling was done on the same dates as the DNR’s ambient monitoring site downstream of Ames, located just below the WWTP at 280th St (DS1). The downstream site has been monitored from October 1998 to the present.

filter(DNR, siteID %in% sites_comp) %>%
  ggplot(aes(x = Week, y = name, shape = Day)) +
  geom_point() +
  scale_x_continuous(minor_breaks = seq(0,52, 1)) +
  theme_light() +
  facet_wrap(~Year, ncol = 1) +
  labs(y = "Site", x = "Week of Year", 
       title = "Timing of sampling, 2020", subtitle = "DNR nutrient data")

When making comparisons between sites, it’s best to look at the same time period.

filter(DNR, siteID %in% sites_comp, analyte == "Phosphate-phosphorus (as P)") %>%
  ggplot(aes(x = name, y = result, color = name)) +
  geom_boxplot() +
  geom_line(aes(y =detectLimit)) +
    stat_summary(fun=mean, geom="point", shape=24, size=4, 
               aes(group = name), fill = "black", position = position_dodge2(1)) +
  labs(title = "Comparison of sites, all dates", y = "Total phosphorus (mg/L)") +
   theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_blank(),
         axis.ticks.x = element_blank())

filter(DNR, siteID %in% sites_comp, analyte == "Phosphate-phosphorus (as P)",
       as.Date(Date) %in% MatchingDates$Date) %>%
  ggplot(aes(x = name, y = result, color = name)) +
  geom_boxplot() +
    stat_summary(fun=mean, geom="point", shape=24, size=4, 
               aes(group = name), color = "black", position = position_dodge2(1)) +
  geom_line(aes(y =detectLimit)) +
  labs(title = "Comparison of sites, Oct 2000- Sept 2014", y = "Total phosphorus (mg/L)") +
   theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_blank(),
         axis.ticks.x = element_blank())

From Oct 2000-Sept 2014, we can compare a variety of water quality metrics in the South Skunk River upstream and downstream of Squaw Creek. Median chloride, chlorophyll a, E. coli, flow phosphorus (both orthophosphate and total phosphorus), nitrogen (both inorganic and kjehldahl nitrogen), sulfates and temperature are higher downstream. Median atrazine, hardness, pheophytin a are higher upstream. Triangles indicate the mean.

# Quick exploratory boxplots for all the selected analytes
filter(DNR, siteID %in% sites_comp,
       as.Date(Date) %in% MatchingDates$Date) %>%
  ggplot(aes(x = name, y = result, color = name)) +
  geom_boxplot() +
    stat_summary(fun=mean, geom="point", shape=24, size=3, 
               aes(group = name), color = "black", position = position_dodge2(1)) +
  geom_line(aes(y =detectLimit)) +
   theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_blank(),
         axis.ticks.x = element_blank()) +
  facet_wrap(~analyte, ncol = 5, scales = "free_y")