Lake Auburn Long-Term Water Quality Data Analysis

Code written by Sarah Smith using data collected and provided by the Lake Auburn Watershed Protection Commission and Holly Ewing. Data analysis were used to support the community-engaged research conducted by Sarah Smith, Lili Missbrenner, and Jonah Yaffe for ENVR417.

The water quality data analysis conducted in this research was broken up by sampling type, since the data were collected at in-lake sampling sites and perimeter sampling sites. First, I started with analyzing/visualizing the perimeter data. There were several different data sets within the 2010-2025 time chunk, so I had to read in the data separately and clean the data frames before joining all of the data together. This process also involved filtering out sampling sites with high sample abundance. Then, I was able to calculate means and visualize the data in different ways. Many of the graphs included in this code were not used in the report but helped shape the figures that were included. Additionally, I attempted to hide some of the outputs of certain code chunks that would make this document difficult to read.

First, Library all of the necessary packages.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(tidyr)
library(lubridate)
library(performance)
library(patchwork)
library(ggplot2)
library(broom)
library(ggsci)
library(patchwork)
library(knitr)

Set working directory

setwd("~/Documents/ENVR417")

Load 2019-2024 data

PER_19_24 <- read.csv("AUB_PER_2019_2025.csv")
PER_19_24_clean_names <- clean_names(PER_19_24)
PER_19_24_clean_names

Make a new data frame with the selected columns with data we want

PER_19_24_clean <- PER_19_24_clean_names|>
  dplyr::select(year, day, date, time, location_site, turbidity_ntu, conductivity_u_s, tds_mg_l)

PER_19_24_clean
PER_19_24_cleaner <- PER_19_24_clean|>
  rename("location" = "location_site")
PER_19_24_cleaner

Make a histogram of the sites to see how many data points each site has

hist_late <-ggplot(data=PER_19_24_cleaner, aes(x=location))+
  geom_histogram(stat='count')+
  theme_bw()
Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
`binwidth` and `bins`
hist_late

Load 2013-2018

PER_13_18 <- read.csv("AUB_PER_2013_2018.csv")|>
  drop_na()
table(PER_13_18$Parameter)
PER_13_18$Parameter <- gsub("Conductivity uS ", "Conductivity uS", PER_13_18$Parameter)
table(PER_13_18$Parameter)

Pivot the data so that parameters are column headers and the “result.value” is underneath

PER_13_18_wider <- pivot_wider(data=PER_13_18, names_from = "Parameter", values_from = "Result.Value")
Warning: Values from `Result.Value` are not uniquely identified; output will contain
list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
  {data} |>
  dplyr::summarise(n = dplyr::n(), .by = c(Year, Day, Date, Time, Location,
  Parameter)) |>
  dplyr::filter(n > 1L)
head(PER_13_18_wider)
PER_13_18_wider_clean_names <- clean_names(PER_13_18_wider)
PER_13_18_wider_clean_names

Make a new data frame with only the columns we want

PER_13_18_clean <- PER_13_18_wider_clean_names|>
  dplyr::select(year, day, date, time, location, turbidity_ntu, conductivity_u_s, tds_mg_l)
PER_13_18_clean

Read in 2010-2012 data

PER_10_12 <- read.csv("AUB_PER_10_12.csv")
PER_10_12_clean_names <- clean_names(PER_10_12)
PER_10_12_clean_names

Make a new data frame with the selected columns with data we want

PER_10_12_clean <- PER_10_12_clean_names|>
  dplyr::select(year, day, date, time, location, turbidity_ntu, conductivity_u_s, tds_mg_l)
PER_19_24_clean

read in the 2025 data

PER_25 <- read.csv("AUB_PER_25.csv")
PER_25_clean_names <- clean_names(PER_25)
PER_25_clean_names

Make a new data frame with the selected columns with data we want

PER_25_clean <- PER_25_clean_names|>
  dplyr::select(year, day, date, time, location, turbidity_ntu, conductivity_u_s, tds_mg_l)
PER_25_clean

Join the data sets so that we have 2010-2025

PER_10_25 <- rbind(PER_10_12_clean, PER_13_18_clean, PER_19_24_cleaner, PER_25_clean)|>
  filter()
PER_10_25

Make another histogram/table that totals up the number of samples by site

PER_10_25$location=as.factor(PER_10_25$location)

PER_hist <-ggplot(data=PER_10_25, aes(x=location))+
  geom_histogram(stat='count')+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45))
Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
`binwidth` and `bins`
PER_hist

site_count_table<-PER_10_25 |>
  select(location, conductivity_u_s) |>
  group_by(location) |>
  dplyr::summarise(count=n())
site_count_table

site_count_table_2<-PER_10_25 |>
  select(location, turbidity_ntu) |>
  group_by(location) |>
  dplyr::summarise(count=n())
site_count_table_2

Filter out the site numbers with the greatest amount of data

PER_10_25$location<-as.factor(PER_10_25$location)


PER_10_25_new <- PER_10_25|>
  filter(location==("2") | location==("16") | location==("26") | location==("25") |location==("13") | location==("27") | location==("3") | location==("18") | location==("17") | location==("TBR") | location==("1") | location==("4") | location==("23") | location==("R-2") | location==("5A") | location==("B-1"))
PER_10_25_new

create a new date column with a better formatted/usable date

PER_10_25_new <- PER_10_25_new|>
  mutate(date_new = mdy(date))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `date_new = mdy(date)`.
Caused by warning:
!  3 failed to parse.

make a new data frame with just conductivity, drop any rows with NULL values

PER_10_25_COND <- PER_10_25_new|>
  dplyr::select(year, day, date, time, location, conductivity_u_s, date_new)|>
  drop_na()
PER_10_25_COND

change the data to be numeric

PER_10_25_COND$conductivity_u_s <- as.numeric(PER_10_25_COND$conductivity_u_s)

Make a conductivity scatterplot to help visualize the data

PER_cond <- ggplot(data=PER_10_25_COND, aes(x=date_new, y=conductivity_u_s))+
  geom_point()+
  labs(x='Date of Sample', y='Conductivity (uS)', title='Conductivity of Perimeter Samples in Lake Auburn from 2010-2025')+
  theme_classic()
PER_cond
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

make a table to find the outliers

cond_table<-PER_10_25_COND |>
  select(date, location, conductivity_u_s) |>
  group_by(location)
cond_table

make a new data frame with averages grouped by year and site location

avg_PER_COND <- PER_10_25_COND|>
  group_by(year, location)|>
  summarize(meanCOND = mean(conductivity_u_s),
            sd = sd(conductivity_u_s),
            n=n(),
            se = (sd/sqrt(n)))
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
mean_PER_cond <- ggplot(data=avg_PER_COND, aes(x=year, y=meanCOND, color=location))+
  geom_point()+
  geom_errorbar(data=avg_PER_COND, aes(x=year, ymin=meanCOND-se, ymax=meanCOND+se), width=.2)+
  labs(x='Year', y='Mean Conductivity (uS)', title='Mean Conductivity of Perimeter Samples in Lake Auburn from 2010-2025')+
  theme_classic()
mean_PER_cond
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

attempt a new graph with just means by year

avg_PER_COND_2 <- PER_10_25_COND|>
  group_by(year)|>
  summarize(meanCOND = mean(conductivity_u_s),
            sd = sd(conductivity_u_s),
            n=n(),
            se = (sd/sqrt(n)))
mean_PER_cond_2 <- ggplot(data=avg_PER_COND_2, aes(x=year, y=meanCOND))+
  geom_point()+
  geom_errorbar(data=avg_PER_COND_2, aes(x=year, ymin=meanCOND-se, ymax=meanCOND+se), width=.2)+
  #geom_jitter(data=PER_10_25_COND_filtered, aes(x=year, y=conductivity_u_s), col='orange')+
  labs(x='Year', y='Mean Conductivity (uS/cm)', title='Mean Conductivity of Perimeter Samples in Lake Auburn from 2010-2025')+
  theme_classic()
mean_PER_cond_2
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

make a graph with means by location

avg_PER_COND_3 <- PER_10_25_COND|>
  drop_na()|>
  group_by(location)|>
  summarize(meanCOND = mean(conductivity_u_s),
            sd = sd(conductivity_u_s),
            n=n(),
            se = (sd/sqrt(n)))
mean_PER_cond_3 <- ggplot(data=avg_PER_COND_3, aes(x=location, y=meanCOND))+
  geom_jitter(data=PER_10_25_COND, size=.5, width=.4, aes(x=location, y=conductivity_u_s), col= 'orange',alpha= .5)+
  geom_point(col="darkblue")+
  geom_errorbar(data=avg_PER_COND_3, aes(x=location, ymin=meanCOND-se, ymax=meanCOND+se), width=.2, col="darkblue")+
  labs(x='Site', y='Mean Conductivity (uS/cm)', title='Mean Conductivity of Perimeter Samples in Lake Auburn from 2010-2025')+
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5))
mean_PER_cond_3
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

now make a boxplot for the data organized by site to help visualize where the outliers are

PER_cond_boxplot <- ggplot(data=PER_10_25_COND, aes(x=location, y=conductivity_u_s))+
  geom_boxplot()+
  labs(x='Sampling Site', y='Conductivity (uS)', title='Conductivity of Perimeter Sampling Sites in Lake Auburn from 2010-2025')+
  theme_classic()+
  theme(plot.title = element_text(hjust = 0.5))
PER_cond_boxplot
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_boxplot()`).

To help with visualizing conductivity and chloride loading trends across space and over time, make two new data frames, one with data from 2014-2019 and one with data from 2020-2025

make a new data frame that filters out any conductivity measurements above 800 as this is unlikely to be accurate

PER_10_25_COND_filtered <- PER_10_25_COND|>
  filter(conductivity_u_s <= 800)
PER_10_25_COND_filtered$year <- as.factor(PER_10_25_COND_filtered$year)

per_cond_14_19 <- PER_10_25_COND_filtered|>
  filter(year =="2014"| year =="2015"| year =="2016" | year =="2017"| year=='2018' | year =="2019")

per_cond_20_25 <- PER_10_25_COND_filtered|>
  filter(year =="2020"| year =="2021"| year =="2022" | year =="2023"| year=='2024' | year =="2025")

now calculate mean conductvity by site for those chunks of years

avg_per_cond_14_19 <- per_cond_14_19|>
  group_by(location)|>
  summarize(meanCOND = mean(conductivity_u_s),
            sd = sd(conductivity_u_s),
            n=n(),
            se = (sd/sqrt(n)))

avg_per_cond_20_25 <- per_cond_20_25|>
  group_by(location)|>
  summarize(meanCOND = mean(conductivity_u_s),
            sd = sd(conductivity_u_s),
            n=n(),
            se = (sd/sqrt(n)))

make a graph with each of the year chunks plotted for each location

mean_per_cond14_19_20_25 <- ggplot(data=avg_per_cond_14_19, aes(x=location, y=meanCOND))+
  geom_point(size=2)+
  geom_errorbar(data=avg_per_cond_14_19, aes(x=location, ymin=meanCOND-se, ymax=meanCOND+se), width=.3)+
  geom_point(data=avg_per_cond_20_25, aes(x=location, y=meanCOND), size=2)+
  geom_errorbar(data=avg_per_cond_20_25, aes(x=location, ymin=meanCOND-se, ymax=meanCOND+se), width=.3)+
  labs(x='Site', y='Mean Conductivity (uS/cm)', title='Mean Conductivity of Perimeter Samples in Lake Auburn from 2014-2019 and 2020-2025')+
  theme_bw()+
  scale_color_bmj()+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(axis.title = element_text(size=18))
mean_per_cond14_19_20_25

mutate a new column with the equation for the chloride conversion in the conductivity data frame for each time chunk, this will be in mEq so convert it to mg/L by multiplying by the atomic weight

avg_per_cond_20_25 <- avg_per_cond_20_25|>
  mutate(meanCHLOR = ((avg_per_cond_20_25$meanCOND*0.067)-.0796))
avg_per_cond_20_25 <- avg_per_cond_20_25|>
  mutate(meanCHLOR_mgL = (avg_per_cond_20_25$meanCHLOR*35.45))
avg_per_cond_14_19 <- avg_per_cond_14_19|>
  mutate(meanCHLOR = ((avg_per_cond_14_19$meanCOND*0.067)-.0796))
avg_per_cond_14_19 <- avg_per_cond_14_19|>
  mutate(meanCHLOR_mgL = (avg_per_cond_14_19$meanCHLOR*35.45))

now read in the stream data, get a mean discharge by site for the same year chunks

discharge_14_25 <- read.csv("discharge_15_25.csv")

filter for only the columns we need: year, Site 2, and Sum(m3_psec)

discharge_14_25_clean <- clean_names(discharge_14_25)
discharge_14_25_NEW <- discharge_14_25_clean[,c(2,3,6,9)]|>
  drop_na()

now make 2 data frames, one with flow data from 2014-2019 and 2020-2025

discharge_14_25_NEW$year <- as.factor(discharge_14_25_NEW$year)

discharge_14_19 <- discharge_14_25_NEW|>
  filter(year =="2014"| year =="2015"| year =="2016" | year =="2017"| year=='2018' | year =="2019")

discharge_20_25 <- discharge_14_25_NEW|>
  filter(year =="2020"| year =="2021"| year =="2022" | year =="2023"| year=='2024' | year =="2025")

now get averages by site for each time chunk

avg_discharge_14_19 <- discharge_14_19|>
  group_by(site)|>
  summarize(meanDIS = mean(sum_m3_psec),
            sd = sd(sum_m3_psec),
            n=n(),
            se = (sd/sqrt(n)))

avg_discharge_20_25 <- discharge_20_25|>
  group_by(site)|>
  summarize(meanDIS = mean(sum_m3_psec),
            sd = sd(sum_m3_psec),
            n=n(),
            se = (sd/sqrt(n)))
discharge_14_19$sum_m3_psec <- as.numeric(discharge_14_19$sum_m3_psec)

instead of avg flow, because this number is likely a vast over estimation, try for median flow to use in the estimated chloride loads

med_discharge_14_19 <- discharge_14_19|>
  group_by(site)|>
  summarise(median_discharge = median(sum_m3_psec, na.rm = TRUE))

med_discharge_20_25 <- discharge_20_25|>
  group_by(site)|>
  summarise(median_discharge = median(sum_m3_psec, na.rm = TRUE))

add a new column to convert m3/sec to L/sec 9the flow data)

avg_discharge_14_19 <- avg_discharge_14_19|>
  mutate(meanDIS_LperSEC = (avg_discharge_14_19$meanDIS*1000))
avg_discharge_20_25 <- avg_discharge_20_25|>
  mutate(meanDIS_LperSEC = (avg_discharge_20_25$meanDIS*1000))

add a new column to convert m3/sec to L/sec in the median data set also

med_discharge_14_19$median_discharge <- as.numeric(med_discharge_14_19$median_discharge)
med_discharge_14_19 <- med_discharge_14_19|>
  mutate(median_discharge_LperSEC = (med_discharge_14_19$median_discharge*1000))
med_discharge_20_25$median_discharge <- as.numeric(med_discharge_20_25$median_discharge)
med_discharge_20_25 <- med_discharge_20_25|>
  mutate(median_discharge_LperSEC = (med_discharge_20_25$median_discharge*1000))

now select for only the sites we have avg conductivity for

avg_discharge_20_25$site <- as.factor(avg_discharge_20_25$site)

avg_discharge_20_25_new <- avg_discharge_20_25|>
  filter(site == "13" | site == "16" | site == "17" | site == "18" | site == "2" | site == "23" | site == "25" | site == "26" | site == "27" | site == "3" | site == "4" | site == "5A" | site == "B-1" | site == "R-2" | site == "TBR")
avg_discharge_14_19$site <- as.factor(avg_discharge_14_19$site)

avg_discharge_14_19_new <- avg_discharge_14_19|>
  filter(site == "13" | site == "16" | site == "17" | site == "18" | site == "2" | site == "23" | site == "25" | site == "26" | site == "27" | site == "3" | site == "4" | site == "5A" | site == "B-1" | site == "R-2" | site == "TBR")

now filter the median data sets the same way

med_discharge_20_25$site <- as.factor(med_discharge_20_25$site)

med_discharge_20_25_new <- med_discharge_20_25|>
  filter(site == "13" | site == "16" | site == "17" | site == "18" | site == "2" | site == "23" | site == "25" | site == "26" | site == "27" | site == "3" | site == "4" | site == "5A" | site == "B-1" | site == "R-2" | site == "TBR")
med_discharge_14_19$site <- as.factor(med_discharge_14_19$site)

med_discharge_14_19_new <- med_discharge_14_19|>
  filter(site == "13" | site == "16" | site == "17" | site == "18" | site == "2" | site == "23" | site == "25" | site == "26" | site == "27" | site == "3" | site == "4" | site == "5A" | site == "B-1" | site == "R-2" | site == "TBR")

filter to get rid of site 1 in the other data set

avg_per_cond_14_19_new <- avg_per_cond_14_19|>
  filter(location!=1 | location!=16 | location!=26 | location!="TBR")
avg_per_cond_20_25$location <- as.factor(avg_per_cond_20_25$location)

avg_per_cond_20_25_new <- avg_per_cond_20_25|>
  filter(location == "13" | location == "17" | location == "18" | location == "2" | location == "23" | location == "25" | location == "27" | location == "3" | location == "4" | location == "5A" | location == "B-1" | location == "R-2")

change the name of location to site

colnames(avg_per_cond_14_19_new)[1] <- "site"
colnames(avg_per_cond_20_25)[1] <- "site"

now add the avg and median discharge in the next column by site, this places all of the calculations in the same data set the column names are important to be able to distinguish between the data and different units, etc.!

avg_load_14_19 <- merge(avg_per_cond_14_19_new, avg_discharge_14_19_new, by = c("site" = "site"))
avg_load_20_25 <- merge(avg_per_cond_20_25, avg_discharge_20_25_new, by = c("site" = "site"))
avg_load_14_19 <- merge(avg_load_14_19, med_discharge_14_19_new, by = c("site" = "site"))
avg_load_20_25 <- merge(avg_load_20_25, med_discharge_20_25_new, by = c("site" = "site"))

now add a column that multiplies chloride concentration by discharge to get load in mg/sec

avg_load_14_19 <-avg_load_14_19|>
  mutate(load_mgperSEC = avg_load_14_19$meanCHLOR_mgL*avg_load_14_19$meanDIS_LperSEC)
avg_load_20_25 <-avg_load_20_25|>
  mutate(load_mgperSEC = avg_load_20_25$meanCHLOR_mgL*avg_load_20_25$meanDIS_LperSEC)

now add another column that multiplies chloride concentration by the median

avg_load_14_19 <-avg_load_14_19|>
  mutate(load_mgperSEC_new = avg_load_14_19$meanCHLOR_mgL*avg_load_14_19$median_discharge_LperSEC)
avg_load_20_25 <-avg_load_20_25|>
  mutate(load_mgperSEC_new = avg_load_20_25$meanCHLOR_mgL*avg_load_14_19$median_discharge_LperSEC)

now add another colum that converts the mg/sec into mg/year by multiplying by 60, 1440, 365

avg_load_14_19 <-avg_load_14_19|>
  mutate(load_mgperYEAR = avg_load_14_19$load_mgperSEC*60*1440*300)|>
  mutate(years = "2014-2019")
avg_load_20_25 <-avg_load_20_25|>
  mutate(load_mgperYEAR = avg_load_20_25$load_mgperSEC*60*1440*300)|>
  mutate(years = "2020-2025")

do this again for the loads calculated using median flow and making sure to only multiply by 250 (b/c the streams don’t flow for 365 days a year)

avg_load_14_19 <-avg_load_14_19|>
  mutate(load_mgperYEAR_new = avg_load_14_19$load_mgperSEC_new*60*1440*250)
avg_load_20_25 <-avg_load_20_25|>
  mutate(load_mgperYEAR_new = avg_load_20_25$load_mgperSEC_new*60*1440*250)

stack the two data frames with rbind so that it is easier to graph and group by year

avg_load_14_25 <- rbind(avg_load_14_19, avg_load_20_25)

convert into kg/year and tons/year to aid with visualization later

avg_load_14_25 <- avg_load_14_25|>
  mutate(load_tonsperYEAR = avg_load_14_25$load_mgperYEAR/9.072e+8)|>
  mutate(load_kgperYEAR_new = avg_load_14_25$load_mgperYEAR_new/1000000)|>
  mutate(load_tonsperYEAR_new = avg_load_14_25$load_mgperYEAR_new/9.072e+8)

try to make a graph with each site and the total load for that year

load_graph <- ggplot(data = avg_load_14_25, aes(x=site, y=load_mgperYEAR, fill =years))+
  geom_col(position = 'dodge')+
  coord_flip()+
  theme_bw()+
  labs(x='Site', y= "Estimated Average Annual Chloride Input (mg/year)")+
  scale_fill_bmj()+
  theme(axis.title = element_text(size=12))
load_graph

make another graph with the loads estimated using the median flow and fewer days of flow per year

load_graph2 <- ggplot(data = avg_load_14_25, aes(x=site, y=load_tonsperYEAR_new, fill =years))+
  geom_col(position = 'dodge')+
  coord_flip()+
  theme_bw()+
  labs(x='Site', y= "Estimated Average Annual Chloride Input (tons/year)")+
  scale_fill_bmj()+
  theme(axis.title = element_text(size=15))
load_graph2

now make a graph that shows the median flows so that it is easier to see how flow differs based on site

flow_graph <- ggplot(data=avg_load_14_25, aes(x=site, y=median_discharge_LperSEC, col =years))+
  geom_point(size=3, position = position_dodge(width=.2))+
  theme_bw()+
  scale_color_bmj()+
  labs(x='Site', y= "Median Flow (L/sec)")+
  theme(axis.title = element_text(size=15))
flow_graph

to get the most recent measurements for perimeter conductivity and turbidity, make a new data frame with just data from 2025

per_cond_25 <- PER_10_25_COND_filtered|>
  filter(year ==2025)

calculate mean perimeter conductivity by location for 2025

avg_per_cond_25 <- per_cond_25|>
  group_by(location)|>
  summarize(meanCOND = mean(conductivity_u_s),
            sd = sd(conductivity_u_s),
            n=n(),
            se = (sd/sqrt(n)))

also, calculate the overall mean conductivity for the year

avg_per_cond25_1 <- per_cond_25|>
  summarize(meanCOND = mean(conductivity_u_s),
            sd = sd(conductivity_u_s),
            n=n(),
            se = (sd/sqrt(n)))

graph all of these values

mean_per_cond25 <- ggplot(data=avg_per_cond_25, aes(x=location, y=meanCOND))+
  geom_jitter(data=per_cond_25, size=.75, width=.4, aes(x=location, y=conductivity_u_s), alpha= 1, col='orange', size=2)+
  geom_point(col='darkblue', size =2)+
  geom_errorbar(data=avg_per_cond_25, aes(x=location, ymin=meanCOND-se, ymax=meanCOND+se), width=.5, col= 'darkblue')+
  geom_point(data=avg_per_cond25_1, aes(x= 7.5, y=meanCOND), col='red', size = 2)+
  geom_errorbar(data=avg_per_cond25_1, aes(x=7.5, ymin=meanCOND-se, ymax=meanCOND+se), width=.5, col= 'red')+
  labs(x='Site', y='Mean Conductivity (uS/cm)', title='Mean Conductivity of Perimeter Samples in Lake Auburn in 2025')+
  theme_bw()+
  theme(axis.title = element_text(size=18))
Warning: Duplicated aesthetics after name standardisation: size
mean_per_cond25

make a new data frame with just turbidity (across the whole time span), drop any rows with NULL values and filter any turbidity measurements over 60 because this is almost impossible

PER_10_25_TURB <- PER_10_25_new|>
  dplyr::select(year, day, date, time, location, turbidity_ntu, date_new)|>
  drop_na()|>
  filter(turbidity_ntu <= 60)
PER_10_25_TURB

change the data to be numeric

PER_10_25_TURB$turbidity_ntu <- as.numeric(PER_10_25_TURB$turbidity_ntu)

make a turbidity scatterplot to visualize the data

PER_turb <- ggplot(data=PER_10_25_TURB, aes(x=date_new, y=turbidity_ntu))+
  #geom_point()+
  geom_smooth()+
  labs(x='Date of Sample', y='Turbidity (NTU)', title='Turbidity of Perimeter Samples in Lake Auburn from 2010-2025')+
  theme_classic()
PER_turb
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

attempt a graph with turbidity means by year

avg_PER_TURB <- PER_10_25_TURB|>
  group_by(year)|>
  summarize(meanTURB = mean(turbidity_ntu),
            sd = sd(turbidity_ntu),
            n=n(),
            se = (sd/sqrt(n)))
mean_PER_turb <- ggplot(data=avg_PER_TURB, aes(x=year, y=meanTURB))+
  geom_point()+
  geom_errorbar(data=avg_PER_TURB, aes(x=year, ymin=meanTURB-se, ymax=meanTURB+se), width=.2)+
  labs(x='Year', y='Mean Turbidity (NTU)', title='Mean Turbidity of Perimeter Samples in Lake Auburn from 2010-2025')+
  theme_classic()
mean_PER_turb

try making a linear model of turbidity over time

PER_10_25_TURB$date2 <- mdy(PER_10_25_TURB$date)
lm_per_turb <- lm(turbidity_ntu ~ date2, data = PER_10_25_TURB)
summary(lm_per_turb)

lm_per_turb|>
  augment()|>
  ggplot(aes(x=date2, y=turbidity_ntu))+
  geom_point()+
  geom_line(aes(y=.fitted), lwd=1.25, col='red')+
  theme_classic()+
  labs(x='Date', y='Turbidity (NTU)')+
  theme(axis.title = element_text(size=16))

make a new data frame with just data from 2025, the calculate means by site and plot the means with error bars and raw data

per_turb_25 <- PER_10_25_TURB|>
  filter(year == 2025)
per_turb_25$location <- as.factor(per_turb_25$location)
turb_table25 <- per_turb_25|>
  dplyr::select(location, turbidity_ntu)|>
  group_by(location) |>
  dplyr::summarise(count=n())
turb_table25

make a table to see how many samples are at each site

cond_table25<-per_cond_25 |>
  dplyr::select(location, conductivity_u_s) |>
  group_by(location)|>
  dplyr::summarise(count=n())
cond_table25

calculate and graph average turbidty by site and overall for the year of 2025

avg_per_turb_25 <- per_turb_25|>
  group_by(location)|>
  summarize(meanturb = mean(turbidity_ntu),
            sd = sd(turbidity_ntu),
            n=n(),
            se = (sd/sqrt(n)))
avg_per_turb_25_new <- per_turb_25|>
  summarize(meanturb = mean(turbidity_ntu),
            sd = sd(turbidity_ntu),
            n=n(),
            se = (sd/sqrt(n)))
mean_per_turb25 <- ggplot(data=avg_per_turb_25, aes(x=location, y=meanturb))+
  geom_jitter(data=per_turb_25, size=1.2, aes(x=location, y=turbidity_ntu), alpha= 1, col='orange')+
  geom_point(col='darkblue', size=2)+
  geom_errorbar(data=avg_per_turb_25, aes(x=location, ymin=meanturb-se, ymax=meanturb+se), width=.5, col= 'darkblue')+
  geom_point(data=avg_per_turb_25_new, aes(x=7, y=meanturb), col="red", size=2)+
  geom_errorbar(data =avg_per_turb_25_new, aes(x=7, ymin=meanturb-se, ymax=meanturb+se), width=.5, col='red')+
  labs(x='Site', y='Mean Turbidity (NTU)', title='Mean Turbidity of Perimeter Samples in Lake Auburn in 2025')+
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(axis.title = element_text(size=18))
mean_per_turb25

attempt a new graph with just turbidity means by site location (excluding readings >60)

avg_PER_TURB_filtered <- PER_10_25_TURB|>
  group_by(location)|>
  summarize(meanTURB = mean(turbidity_ntu),
            sd = sd(turbidity_ntu),
            n=n(),
            se = (sd/sqrt(n)))
mean_PER_turb_filtered <- ggplot(data=avg_PER_TURB_filtered, aes(x=location, y=meanTURB))+
  geom_jitter(data=PER_10_25_TURB, size=.5, width=.4, aes(x=location, y=turbidity_ntu), alpha= .65, col='orange')+
  geom_point(col='darkblue', size=2)+
  geom_errorbar(data=avg_PER_TURB_filtered, aes(x= location, ymin=meanTURB-se, ymax=meanTURB+se), width=.5, col='darkblue')+
  labs(x='Site', y='Mean Turbidity (NTU)', title='Mean Turbidity of Perimeter Samples in Lake Auburn from 2010-2025')+
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(axis.title = element_text(size=18))
mean_PER_turb_filtered

Next, analyze the in-lake data. These data also had to be combined across several different data sets in various formats. There was a large data set from 2010-2022, but the most recent years had to be added.

Again, Library all of the necessary packages

library(tidyverse)
library(dplyr)
library(broom)
library(janitor)
library(tidyr)
library(lubridate)
library(performance)
library(patchwork)
library(ggplot2)
library(ggsci)
library(patchwork)
library(segmented)
Warning: package 'segmented' was built under R version 4.4.3
Loading required package: MASS

Attaching package: 'MASS'
The following object is masked from 'package:patchwork':

    area
The following object is masked from 'package:dplyr':

    select
Loading required package: nlme

Attaching package: 'nlme'
The following object is masked from 'package:dplyr':

    collapse

Set working directory

setwd("~/Documents/ENVR417")

read in the in-lake data

in_lake <- read.csv("In_Lake_10_25.csv")

pivot the data so that parameters are column headers and the “Result” is underneath

in_lake_wider <- pivot_wider(data=in_lake, names_from = "Parameter", values_from = "Result")
Warning: Values from `Result` are not uniquely identified; output will contain
list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
  {data} |>
  dplyr::summarise(n = dplyr::n(), .by = c(SAMPLE_DATE, SAMPLE_TIME, StationID,
  SAMPLE_DEPTH, CORE, Comments, Flag, Type, Source, SAMPLE_LOCATION, Units,
  Parameter)) |>
  dplyr::filter(n > 1L)
in_lake_wider
in_lake_wider_clean_names <- clean_names(in_lake_wider)
in_lake_wider_clean_names

add in the data from 2023, 2024, and 2025 (these need to be edited so that they can be joined with the previous data set)

in_lake_23 <- read.csv("in_lake_23_2.csv")
in_lake_23$Result <- as.numeric(in_lake_23$Result)
in_lake_23$Parameter <- as.factor(in_lake_23$Parameter)
in_lake_23_new <- in_lake_23|>
  filter(Parameter != "")
in_lake_23_wide <- pivot_wider(data=in_lake_23_new, names_from = "Parameter", values_from = "Result")
Warning: Values from `Result` are not uniquely identified; output will contain
list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
  {data} |>
  dplyr::summarise(n = dplyr::n(), .by = c(Year, Day, Date, Time, Location,
  Station_Number, Location_Name, Depth..m., Core.Depth..m., Units, RL, MDL,
  Flag, Comments, Lab, Equipment.used, Serial.Number, Parameter)) |>
  dplyr::filter(n > 1L)
in_lake_23_wide
in_lake_23_clean <- clean_names(in_lake_23_wide)|>
  dplyr::select(year, date, time, location, station_number, depth_m, conductivity, tds)

do the same things for the 2024 data set

in_lake_24 <- read.csv("in_lake_24.csv")
in_lake_24$X <- as.numeric(in_lake_24$X)
Warning: NAs introduced by coercion
in_lake_24$Parameter <- as.factor(in_lake_24$Parameter)
in_lake_24_new <- in_lake_24|>
  filter(Parameter != "")
in_lake_24_wide <- pivot_wider(data=in_lake_24_new, names_from = "Parameter", values_from = "X")
in_lake_24_wide
in_lake_24_clean <- clean_names(in_lake_24_wide)|>
  dplyr::select(year, date, time, location, station_number, depth_m, conductivity, tds)

do the same things for the 2025 data set

in_lake_25 <- read.csv("in_lake_25.csv")
in_lake_25$X <- as.numeric(in_lake_25$X)
Warning: NAs introduced by coercion
in_lake_25$Parameter <- as.factor(in_lake_25$Parameter)
in_lake_25_new <- in_lake_25|>
  filter(Parameter != "")
in_lake_25_wide <- pivot_wider(data=in_lake_25_new, names_from = "Parameter", values_from = "X")
in_lake_25_wide
in_lake_25_clean <- clean_names(in_lake_25_wide)|>
  dplyr::select(year, date, time, location, station_number, depth_m, conductivity, tds)

now stack all of the 2023-2025 data together

in_lake_23_25 <- rbind(in_lake_23_clean, in_lake_24_clean, in_lake_25_clean)|>
  filter()
in_lake_23_25

before combining all of the data, the 2010-2022 data set needs to be edited further make year column for 2010-2022

in_lake_10_22 <- in_lake_wider_clean_names|>
  mutate(date_new = mdy(sample_date))|>
  mutate(year = year(date_new))
in_lake_10_22

select same columns as 23-25

in_lake_10_22_new <- in_lake_10_22|>
  dplyr::select(year, sample_date, sample_time, station_id, sample_depth, sample_location, tds,conductivity)

rename the columns in 2010-2022

colnames(in_lake_10_22_new)[2] <- "date"
colnames(in_lake_10_22_new)[3] <- "time"
colnames(in_lake_10_22_new)[4] <- "station_number"
colnames(in_lake_10_22_new)[5] <- "depth_m"
colnames(in_lake_10_22_new)[6] <- "location"

now, join all of the data together so that it spands 2010-2025!

in_lake_10_25 <- rbind(in_lake_10_22_new, in_lake_23_25)

now check the tables for number of slamples at each depth and each site

depth_count_in_lake_table2<-in_lake_10_25 |>
  dplyr::select(depth_m, conductivity) |>
  group_by(depth_m) |>
  dplyr::summarise(count=n())
depth_count_in_lake_table2

site_count_nofilter_in_lake2 <- in_lake_10_25 |>
  dplyr::select(station_number, conductivity) |>
  group_by(station_number) |>
  dplyr::summarise(count=n())
site_count_nofilter_in_lake2

filter for only those depths with high sampling abundance

in_lake_10_25$depth_m<-as.factor(in_lake_10_25$depth_m)

in_lake_10_25new <- in_lake_10_25|>
  filter(depth_m==("0") | depth_m==("4") | depth_m==("9") | depth_m==("2") | depth_m==("3") | depth_m==("5") | depth_m==("1") | depth_m==("8") | depth_m==("7") | depth_m==("6") | depth_m==("10") | depth_m==("12") | depth_m==("16"))
in_lake_10_25new

check frequency of samples once specific depths are filtered out

site_count_depthfiltered_in_lake <- in_lake_10_25new |>
  dplyr::select(station_number, conductivity) |>
  group_by(station_number) |>
  dplyr::summarise(count=n())
site_count_depthfiltered_in_lake

now filter for only the sampling locations with the highest sampling abudance

in_lake_10_25new$station_number<-as.factor(in_lake_10_25new$station_number)

in_lake_10_25newer <- in_lake_10_25new|>
  filter(station_number==("8") | station_number==("12") | station_number==("32") | station_number==("30") | station_number==("31") | station_number==("29") | station_number==("9") | station_number==("97") | station_number==("7") | station_number==("11") | station_number==("10") | station_number==("6"))
in_lake_10_25newer

now try making a few graphs across the whole time span to start visualizing the data change the data to be numeric

in_lake_10_25newer$conductivity <- as.character(in_lake_10_25newer$conductivity)

in_lake_10_25newer$conductivity <- as.numeric(in_lake_10_25newer$conductivity)
Warning: NAs introduced by coercion

filter out outliers since these measurements are likely inaccurate and don’t necessarily help tell the story of the lake overall

in_lake_10_25newer <- in_lake_10_25newer|>
  filter(conductivity <= 500)|>
  filter(conductivity >= 30)
in_lake_cond_plot2 <- ggplot(data=in_lake_10_25newer, aes(x=date, y=conductivity))+
  geom_point()+
  theme_classic()
in_lake_cond_plot2

in_lake_turb_plot <- ggplot(data=in_lake_10_25newer, aes(x=date, y=turbidity))

now calculate means by year

mean_in_lake_cond_10_25 <- in_lake_10_25newer|>
  na.omit()|>
  group_by(year)|>
  summarize(meancond = mean(conductivity),
          sd = sd(conductivity),
          n=n(),
          se = (sd/sqrt(n)))
mean_in_lake_cond_10_25

graph means with error bars by year

mean_in_lake_cond_10_25graph <- ggplot(data=mean_in_lake_cond_10_25, aes(x=year, y=meancond))+
  geom_jitter(data=in_lake_10_25newer, size=.3, width=.4, aes(x=year, y=conductivity), alpha= .4, col='orange')+
  geom_point(size=2)+
  #geom_smooth()+
   geom_errorbar(data=mean_in_lake_cond_10_25, aes(x=year, ymin=meancond-se, ymax=meancond+se), width=.5, col='darkblue')+
  labs(x='Year', y='Mean Conductivity (uS/cm)', title='Mean Conductivity of In-Lake Samples in Lake Auburn from 2010-2025')+
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(axis.title = element_text(size = 20))
mean_in_lake_cond_10_25graph

attempting a linear regression

lm_1 <- lm(conductivity ~ year, data = in_lake_10_25newer)
summary(lm_1)
plot(conductivity ~ year, data = in_lake_10_25newer)
abline(lm_1, col="blue")

in_lake_10_25newer$date2 <- mdy(in_lake_10_25newer$date)

now try by the linear regression by date instead of year and try using the segmented package

lm_2 <- lm(conductivity ~ date2, data = in_lake_10_25newer)
summary(lm_2)

segs<-segmented(lm_2, seg.Z=~date2, npsi=2)
summary(segs) #this provided me with an R squared value 

augseg<-augment(segs)
Warning: The `augment()` method for objects of class `segmented` is not maintained by the broom team, and is only supported through the `lm` tidier method. Please be cautious in interpreting and reporting broom output.

This warning is displayed once per session.
augseg

ggplot(data=augseg, aes(x=date2, y=conductivity))+
  geom_point()+
  geom_line(aes(y=.fitted), col='red', linewidth=1.5)+
  labs(x='Date', y="Conductivity (uS/cm)")+
  theme_classic()+
  theme(axis.title = element_text(size=20))

# do the davies test to see if the slopes differ from one another?
davies.test(segs, k=10)

#estimation of breakpoint locations
confint(segs)

intercept(segs)
slope(segs)
summary(segs)

attempt another linear regression using the new date column, however the segmented linear regression is likely more helpful to see the changes in trends over time

lm_2|>
  augment()|>
  ggplot(aes(x=date2, y=conductivity))+
  geom_point()+
  geom_line(aes(y=.fitted), width=10)+
  theme_classic()+
  labs(x="Date", y="Conductivity (uS/cm)")+
  theme(axis.title = element_text(size=16))
Warning in geom_line(aes(y = .fitted), width = 10): Ignoring unknown
parameters: `width`

now look at the in-lake turbidity data. There were very few turbidity data points included in the in-lake data sets that we were given, so after trying to calculate mean turbidity by year and realizing that there was only a few years with turbidity data, this analysis mainly just focused on getting a turbidity graph for 2025.

start by reading in the turbidity data for 2025 (this was part of the 2025 Watershed Sampling Results that we were given)

turb_25 <- read.csv("turb_25_2.csv")|>
  clean_names()
turb_25_clean <- turb_25|>
  dplyr::select(year, day, date, time, location_site, ntu)|>
  filter(location_site == "8" | location_site == "12" | location_site == "30"| location_site == "31")
turb_25_clean$location_site <- as.factor(turb_25_clean$location_site)
turb_25_mean <- turb_25_clean|>
  drop_na()|>
  group_by(location_site)|>
  summarize(meanturb25 = mean(ntu),
            sd = sd(ntu),
            n=n(),
            se=(sd/sqrt(n)))
turb_25_mean

also calculate the overall mean in-lake turbidity for 2025

avg_turb_all25 <- turb_25_clean|>
  drop_na()|>
  summarize(meanturb25 = mean(ntu),
            sd = sd(ntu),
            n=n(),
            se=(sd/sqrt(n)))
avg_turb_all25
turb_25_means<- ggplot(data=turb_25_mean, aes(x=location_site, y=meanturb25))+
  geom_jitter(data=turb_25_clean, aes(x=location_site, y=ntu), alpha=.8, col='orange')+
  geom_point(col='darkblue', size =2)+
  geom_errorbar(data=turb_25_mean, aes(x=location_site, ymin=meanturb25-se, ymax=meanturb25+se), width=.4, col='darkblue')+
  labs(x='Site', y='Mean Turbidity (NTU)', title='Mean Turbidity of In-Lake Samples in Lake Auburn in 2025')+
  theme_bw()+
  geom_point(data=avg_turb_all25, aes(x = 2.5, y = meanturb25), color = "red", size = 2)+
  geom_errorbar(data=avg_turb_all25, aes(x=2.5, ymin=meanturb25-se, ymax=meanturb25+se), width=.4, col='red')+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(axis.title = element_text(size = 16))
turb_25_means
Warning: Removed 113 rows containing missing values or values outside the scale range
(`geom_point()`).