# Selects a site of interest and a reference site for comparison
# Here South Skunk downstream "s10850002" and lower Indian Creek  "s10500001"
site_a <- "10500001"
site_ref <- "10850002"
sites_comp <- c(site_a, site_ref)

# The "DNR" placeholders will stand in for any csv file downloaded from DNR's AQuIA
# Within a session, the DNR object can be used in other scripts.
DNR_input <- "data/siteSamplingData-10850002.csv"
DNR_output <- "data/output.csv"
# Document the input when you run variations: 
# downloaded: 2020-07-23
# sites: South Skunk ambient site 10850002
# analytes: all
# dates: all (most recent 2020-06-02)

library(tidyverse)
library(lubridate)
# import from csv.  Some of the error flags have letter codes
# They are rare enough that R can't guess type from the first 1000 rows, so we have to specify
DNR_raw <- read_csv(DNR_input, 
                col_types = cols(qualFlag = col_character(), siteID = col_character(),
                                 quantFlag = col_character(), quantLimit = col_double(), 
                                 remark = col_character(), 
                                 sampleDate = col_datetime(format = "%m/%d/%Y %H:%M:%S"))) %>%
  
  # Combine data from the upstream site
# downloaded: 2020-07-23
# sites: Indian Creek near Colfax site 10500001
# analytes: all
# dates: all (most recent 2020-06-02)
  add_row(read_csv("data/siteSamplingData-10500001.csv", 
                col_types = cols(qualFlag = col_character(), siteID = col_character(),
                                 quantFlag = col_character(), quantLimit = col_double(), 
                                 remark = col_character(), 
                                 sampleDate = col_datetime(format = "%Y-%m-%dT%H:%M:%S%z"))))
# Corrects spelling errors and variations resulting in duplicate analytes
DNR_raw$analyte[DNR_raw$analyte=="Ammonia-nitrogen (AS N)"] <- "Ammonia (as N)"
DNR_raw$analyte[DNR_raw$analyte=="Ammonia-nitrogen (as N)"] <- "Ammonia (as N)"
DNR_raw$analyte[DNR_raw$analyte=="Ammonia (AS N)"] <- "Ammonia (as N)"
DNR_raw$analyte[DNR_raw$analyte=="Ammonia"] <- "Ammonia (as NH3)"
DNR_raw$analyte[DNR_raw$analyte=="Inorganic nitrogen (nitrate and nitrite) (AS N)"] <- "Inorganic nitrogen (nitrate and nitrite) (as N)"
DNR_raw$analyte[DNR_raw$analyte=="Orthophosphate (AS P)"] <- "Orthophosphate (as P)"
DNR_raw$analyte[DNR_raw$analyte=="Phosphate-phosphorus (AS P)"] <- "Phosphate-phosphorus (as P)"
DNR_raw$analyte[DNR_raw$analyte=="Kjeldahl nitrogen (AS N)"] <- "Kjeldahl nitrogen (as N)"
# Optionally, changes nitrate to inorganic nitrogen, since nitrite was negligible during that period
DNR_raw$analyte[DNR_raw$analyte=="Nitrate (as N)"] <- "Inorganic nitrogen (nitrate and nitrite) (as N)"


# Counts number of detects and nondetects, by analyte.
# Revised 2020-08-06. Sum works better than pivot.
DNR_analytes <- DNR_raw %>%
  group_by(analyte) %>%
  summarise(
    count_detected = sum(detect, na.rm = TRUE),
    count_samples = n()
  ) %>%
  arrange(desc(count_detected), desc(count_samples))
DNR_analytes
selected_analytes <- DNR_analytes %>%
  filter(count_detected > 60)
DNR <- DNR_raw %>%
  filter(analyte %in% selected_analytes$analyte)
  # Optional - Break apart sample date
DNR <- DNR %>% mutate(Date = as_date(sampleDate),
         Year = year(Date),
         Month = month(Date, label = TRUE),
         Week = week(Date),
         Day = wday(Date, label = TRUE))
# opt. add a letter code to site ID so it can be more easily pivoted
#  mutate(siteID = str_c("s", siteID))

# makes a list of matching periods, that can then be used to filter results
MatchingDates<- DNR %>%
  group_by(analyte, Date) %>%
  filter(site_a %in% siteID & site_ref %in% siteID) %>%
  ungroup() %>%
  select(Date) %>%
  distinct()

MatchingMonth<- DNR %>%
  group_by(analyte, Year, Month, Date) %>%
  filter(site_a %in% siteID & site_ref %in% siteID) %>%
  ungroup() %>%
  select(Date) %>%
  distinct()

Monitoring at Indian Creek started in October 1999, one year later than the South Skunk. Since 2005, Indian Creek and the South Skunk River have been sampled on the same day.

filter(DNR, siteID %in% sites_comp) %>%
  ggplot(aes(x = Week, y = name, shape = Day)) +
  geom_point() +
  scale_x_continuous(minor_breaks = seq(0,52, 1)) +
  theme_light() +
  facet_wrap(~Year, ncol = 1) +
  labs(y = "Site", x = "Week of Year", 
       title = "Timing of sampling, 2020", subtitle = "DNR nutrient data")

When making comparisons between sites, it’s best to look at the same time period. Here, excluding the one year and the few dates where samples were not coordinated makes little difference.

filter(DNR, siteID %in% sites_comp, analyte == "Phosphate-phosphorus (as P)") %>%
  ggplot(aes(x = name, y = result, color = name)) +
  geom_boxplot() +
  geom_line(aes(y =detectLimit)) +
    stat_summary(fun=mean, geom="point", shape=24, size=4, 
               aes(group = name), color = "black", position = position_dodge2(1)) +
  labs(title = "Comparison of sites, all dates", y = "Total phosphorus (mg/L)") +
   theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_blank(),
         axis.ticks.x = element_blank())

filter(DNR, siteID %in% sites_comp, analyte == "Phosphate-phosphorus (as P)",
       as.Date(Date) %in% MatchingDates$Date) %>%
  ggplot(aes(x = name, y = result, color = name)) +
  geom_boxplot() +
    stat_summary(fun=mean, geom="point", shape=24, size=4, 
               aes(group = name), color = "black", position = position_dodge2(1)) +
  geom_line(aes(y =detectLimit)) +
  labs(title = "Comparison of sites, matching dates", y = "Total phosphorus (mg/L)") +
   theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_blank(),
         axis.ticks.x = element_blank())

Here are multiple analytes, for the 1999-2020 period when both sites were sampled monthly. Median alkalinity, enterococcus, chlorophyll (maybe), pheophytin and TSS are higher in Indian Creek. Ammonia, calcium, carbon, chloride, fecal coliform, nitrogen (both types), phosphorus (both types), sulfate, and various dissolved solids are higher in the South Skunk River.

# Quick exploratory boxplots for all the selected analytes
filter(DNR, siteID %in% sites_comp,
       as.Date(Date) %in% MatchingMonth$Date) %>%
  ggplot(aes(x = name, y = result, color = name)) +
  geom_boxplot() +
    stat_summary(fun=mean, geom="point", shape=24, size=3, 
               aes(group = name), color = "black", position = position_dodge2(1)) +
  geom_line(aes(y =detectLimit)) +
   theme(legend.position = "bottom", axis.title.x = element_blank(), axis.text.x = element_blank(),
         axis.ticks.x = element_blank()) +
  facet_wrap(~analyte, ncol = 5, scales = "free_y")