CRAP: Colne River Assessment Project.

This data was collected by volunteers, by Drew Henderson, by Rob Furguson (a lecturer at the University of Essex who is leading the project), and by me.

(No date) Colne River Action Plan (CRAP). Available at: https://epigenetics.essex.ac.uk/shiny/crap/ (Accessed: 23 August 2024).

Introduction- The Colne River Assessment Project (CRAP) is an ongoing initiative focused on monitoring and assessing the biodiversity and water quality in various waterways across East Essex and Suffolk. Over the past year, the project collected water samples from key water bodies, including the Colne, Stour, Hythe, and Tollesbury rivers. These samples were analyzed to evaluate the levels of E. coli, Enterococcus, and antimicrobial-resistant bacteria (AMR) such as Extended-Spectrum Beta-Lactamase (ESBL) and Vancomycin-Resistant Enterococci (VRE). The project involved volunteers and experts like Drew Henderson and Rob Furguson from the University of Essex. The primary goal was to understand how environmental factors, particularly tide conditions, impact bacterial contamination and to compare the microbial levels against UK inland bathing water standards.

# Install only the packages that are not already available. Calling
# install.packages() unconditionally would download and reinstall all four
# packages every time this script is run.
required_packages <- c("googlesheets4", "googledrive", "ggplot2", "dplyr")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
# Load libraries
library(googlesheets4)
library(googledrive)
## 
## Attaching package: 'googledrive'
## The following objects are masked from 'package:googlesheets4':
## 
##     request_generate, request_make
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the necessary libraries
library(googlesheets4)

# The sheet is public, so switch googlesheets4 to unauthenticated mode
gs4_deauth()

# Import columns A to W of the public data-collection sheet into `data`.
# The first row supplies the column names, and cells that contain "NC" or
# "NA" are read as missing values.
data <- read_sheet(
  ss = "https://docs.google.com/spreadsheets/d/1Z1qwWgIIq1m1TmJi9HJ0JDe69xZCJkABx05OdabkYF8/edit?usp=sharing",
  range = "A:W",
  na = c("NC", "NA"),
  col_names = TRUE
)
## ✔ Reading from "DataCollectionForm".
## ✔ Range 'A:W'.
# Preview the first six rows of the imported data
head(data, n = 6L)
## # A tibble: 6 × 23
##   sampleID siteName               siteID `dateCollected (dd/mm/yy)` Waterway
##   <chr>    <chr>                  <chr>  <dttm>                     <chr>   
## 1 AA01     CampusLake             WP_1   2023-07-05 00:00:00        <NA>    
## 2 AA02     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 3 AA03     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 4 AA04     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 5 AA05     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
## 6 AA06     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
## # ℹ 18 more variables: `timeCollected (hh:mm)` <dttm>,
## #   `highTide (hh:mm)` <list>, `TimeSinceHighTide (hh:mm)` <dttm>, tide <chr>,
## #   collector <chr>, `latitude (xx.xxxxxx)` <dbl>,
## #   `longitude (x.xxxxxx)` <dbl>, PlusCode <chr>, waterSample <dbl>,
## #   sedimentSample <dbl>, daysSinceRain <dbl>, notes <chr>,
## #   ColiformsNonEC <dbl>, E.coli <dbl>, TotalColiforms <dbl>,
## #   entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>
# Import the sheet again (the sheet must be shared so that it can be read).
# This repeats the import performed earlier in the file with the same
# arguments, so `data` is simply refreshed with the same columns A to W.
data <- read_sheet(
  ss = "https://docs.google.com/spreadsheets/d/1Z1qwWgIIq1m1TmJi9HJ0JDe69xZCJkABx05OdabkYF8/edit?usp=sharing",
  range = "A:W",
  na = c("NC", "NA"),
  col_names = TRUE
)
## ✔ Reading from "DataCollectionForm".
## ✔ Range 'A:W'.
# Inspect the first six rows of the refreshed data
head(data, n = 6L)
## # A tibble: 6 × 23
##   sampleID siteName               siteID `dateCollected (dd/mm/yy)` Waterway
##   <chr>    <chr>                  <chr>  <dttm>                     <chr>   
## 1 AA01     CampusLake             WP_1   2023-07-05 00:00:00        <NA>    
## 2 AA02     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 3 AA03     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 4 AA04     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 5 AA05     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
## 6 AA06     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
## # ℹ 18 more variables: `timeCollected (hh:mm)` <dttm>,
## #   `highTide (hh:mm)` <list>, `TimeSinceHighTide (hh:mm)` <dttm>, tide <chr>,
## #   collector <chr>, `latitude (xx.xxxxxx)` <dbl>,
## #   `longitude (x.xxxxxx)` <dbl>, PlusCode <chr>, waterSample <dbl>,
## #   sedimentSample <dbl>, daysSinceRain <dbl>, notes <chr>,
## #   ColiformsNonEC <dbl>, E.coli <dbl>, TotalColiforms <dbl>,
## #   entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>
# Import the sheet once more and drop the "highTide (hh:mm)" column, which is
# read in as a list-column. Every other column is kept.
data <- read_sheet(
  ss = "https://docs.google.com/spreadsheets/d/1Z1qwWgIIq1m1TmJi9HJ0JDe69xZCJkABx05OdabkYF8/edit?usp=sharing",
  range = "A:W",
  na = c("NC", "NA"),
  col_names = TRUE
)
## ✔ Reading from "DataCollectionForm".
## ✔ Range 'A:W'.
data <- dplyr::select(data, -`highTide (hh:mm)`)

# Preview the first six rows to check the remaining columns
head(data, n = 6L)
## # A tibble: 6 × 22
##   sampleID siteName               siteID `dateCollected (dd/mm/yy)` Waterway
##   <chr>    <chr>                  <chr>  <dttm>                     <chr>   
## 1 AA01     CampusLake             WP_1   2023-07-05 00:00:00        <NA>    
## 2 AA02     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 3 AA03     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 4 AA04     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
## 5 AA05     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
## 6 AA06     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
## # ℹ 17 more variables: `timeCollected (hh:mm)` <dttm>,
## #   `TimeSinceHighTide (hh:mm)` <dttm>, tide <chr>, collector <chr>,
## #   `latitude (xx.xxxxxx)` <dbl>, `longitude (x.xxxxxx)` <dbl>, PlusCode <chr>,
## #   waterSample <dbl>, sedimentSample <dbl>, daysSinceRain <dbl>, notes <chr>,
## #   ColiformsNonEC <dbl>, E.coli <dbl>, TotalColiforms <dbl>,
## #   entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>
# List the names of the columns that remain in `data`
colnames(data)
##  [1] "sampleID"                  "siteName"                 
##  [3] "siteID"                    "dateCollected (dd/mm/yy)" 
##  [5] "Waterway"                  "timeCollected (hh:mm)"    
##  [7] "TimeSinceHighTide (hh:mm)" "tide"                     
##  [9] "collector"                 "latitude (xx.xxxxxx)"     
## [11] "longitude (x.xxxxxx)"      "PlusCode"                 
## [13] "waterSample"               "sedimentSample"           
## [15] "daysSinceRain"             "notes"                    
## [17] "ColiformsNonEC"            "E.coli"                   
## [19] "TotalColiforms"            "entericEnterococci"       
## [21] "ESBL"                      "VRE"
# Print the tibble to the console (useful for debugging). Only the first ten
# rows and the columns that fit are shown; the rest are summarised below them.
print(data)
## # A tibble: 297 × 22
##    sampleID siteName               siteID `dateCollected (dd/mm/yy)` Waterway
##    <chr>    <chr>                  <chr>  <dttm>                     <chr>   
##  1 AA01     CampusLake             WP_1   2023-07-05 00:00:00        <NA>    
##  2 AA02     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
##  3 AA03     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
##  4 AA04     Wivenhoe tidal barrier Co_1   2023-07-25 00:00:00        Colne   
##  5 AA05     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
##  6 AA06     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
##  7 AA07     Alresford creek        Co_2   2023-07-25 00:00:00        Colne   
##  8 AA08     buoy 17                Co_4   2023-07-25 00:00:00        Colne   
##  9 AA09     buoy 17                Co_4   2023-07-25 00:00:00        Colne   
## 10 AA10     buoy 17                Co_4   2023-07-25 00:00:00        Colne   
## # ℹ 287 more rows
## # ℹ 17 more variables: `timeCollected (hh:mm)` <dttm>,
## #   `TimeSinceHighTide (hh:mm)` <dttm>, tide <chr>, collector <chr>,
## #   `latitude (xx.xxxxxx)` <dbl>, `longitude (x.xxxxxx)` <dbl>, PlusCode <chr>,
## #   waterSample <dbl>, sedimentSample <dbl>, daysSinceRain <dbl>, notes <chr>,
## #   ColiformsNonEC <dbl>, E.coli <dbl>, TotalColiforms <dbl>,
## #   entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>
# Open the data in RStudio's spreadsheet-like viewer. View() is meant for
# interactive use, so it is skipped when the script is run or knitted
# non-interactively.
if (interactive()) {
  View(data)
}

# Data cleaning and transformation for the tide / waterway boxplots
data_cleaned <- data %>%
  mutate(
    # Date and time of collection. read_sheet() delivers both columns as
    # date-times, in which case `format` is not used; it only matters if a
    # column ever arrives as text.
    `dateCollected (dd/mm/yy)` = as.Date(`dateCollected (dd/mm/yy)`, format = "%d/%m/%y"),
    `timeCollected (hh:mm)` = as.POSIXct(`timeCollected (hh:mm)`, format = "%H:%M"),
    # Make sure the bacteria counts are numeric for plotting and statistics
    across(c(`E.coli`, `TotalColiforms`, entericEnterococci, ESBL, VRE), as.numeric),
    # "NA" strings become real missing values on import, so only the three
    # genuine tide phases are listed as levels (a literal "NA" level would
    # always be empty)
    tide = factor(tide, levels = c("ebb", "flood", "slack")),
    # Some rows carry a blank waterway name rather than NA; treat blank as
    # missing so that the filter below removes those rows as well
    Waterway = na_if(Waterway, "")
  ) %>%
  # Keep only samples that belong to a named waterway
  filter(!is.na(Waterway)) %>%
  mutate(Waterway = as.factor(Waterway))

# Boxplot of one bacterial measurement by tide condition, with one box per
# waterway and the individual samples overlaid as jittered points. The five
# figures below differ only in the column plotted and in their labels, so the
# shared layers live in this helper.
#   df      - data frame with `tide`, `Waterway` and the measurement column
#   y_var   - name of the measurement column, given as a string
#   title   - plot title
#   y_label - y-axis label
# Returns the ggplot object; at top level it is printed automatically.
plot_by_tide <- function(df, y_var, title, y_label) {
  ggplot(df, aes(x = tide, y = .data[[y_var]], fill = Waterway)) +
    geom_boxplot() +
    geom_jitter(aes(color = Waterway), width = 0.2) +
    labs(title = title, x = "Tide Condition", y = y_label) +
    theme_minimal()
}

# Boxplot for Total Coliforms, grouped by tide and waterway
plot_by_tide(
  data_cleaned,
  y_var = "TotalColiforms",
  title = "Comparison of Total Coliforms by Tide and Waterway",
  y_label = "Total Coliforms (cfu/100ml)"
)
## Warning: Removed 227 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 227 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Plot for VRE levels, grouped by tide and waterway
plot_by_tide(
  data_cleaned,
  y_var = "VRE",
  title = "Comparison of VRE by Tide and Waterway",
  y_label = "VRE Levels"
)
## Warning: Removed 273 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 273 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Plot for ESBL levels, grouped by tide and waterway
plot_by_tide(
  data_cleaned,
  y_var = "ESBL",
  title = "Comparison of ESBL by Tide and Waterway",
  y_label = "ESBL Levels"
)
## Warning: Removed 275 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 275 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Plot for E. coli levels, grouped by tide and waterway
plot_by_tide(
  data_cleaned,
  y_var = "E.coli",
  title = "Comparison of E. coli Levels by Tide and Waterway",
  y_label = "E. coli (cfu/100ml)"
)
## Warning: Removed 188 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 188 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Plot for Enterococci levels, grouped by tide and waterway
plot_by_tide(
  data_cleaned,
  y_var = "entericEnterococci",
  title = "Comparison of Enteric Enterococci Levels by Tide and Waterway",
  y_label = "Enteric Enterococci (cfu/100ml)"
)
## Warning: Removed 186 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 186 rows containing missing values or values outside the scale range
## (`geom_point()`).

#### Figures 1, 2, 3, 4, 5. This multi-figure set illustrates the comparative concentrations of various bacterial contaminants, including Total Coliforms, VRE (Vancomycin-Resistant Enterococci), ESBL (Extended Spectrum Beta-Lactamase), E. coli, and Enteric Enterococci, across four waterways (Blackwater, Colne, Roman, Stour) under different tide conditions (ebb, flood, slack, and NA). All data are presented as cfu/100ml (colony-forming units per 100 milliliters). The tidal phases represent different stages of water movement that impact bacterial concentrations, and the waterways are color-coded: Blackwater (red), Colne (green), Roman (blue), Stour (purple), and NA (black) for unspecified or unidentified waterways. The figures show distinct patterns of contamination. Colne had the highest Total Coliforms during flood tide, with lower concentrations in Blackwater and Stour, while Roman showed minimal variability. VRE concentrations were highest in Stour during ebb tide, followed by Colne and Blackwater during flood, with Roman consistently lower. ESBL levels peaked in Blackwater during flood tide, with moderate contamination in Colne and lower levels in Roman and Stour. E. coli concentrations were highest in Stour during slack tide, with moderate contamination in Colne and consistently lower levels in Blackwater and Roman. Enteric Enterococci levels were elevated in Blackwater during flood tide and in Stour during slack tide, while Colne and Roman remained lower throughout. These results indicate that Colne and Stour tend to exhibit higher bacterial levels across categories, particularly during slack and flood tides. The presence of antibiotic-resistant bacteria (VRE and ESBL) underscores the environmental and public health concerns associated with these waterways. This combined legend contextualizes the figures, linking bacterial contamination to waterway and tide conditions.

# Keep only the samples that have a waterway recorded
data_filtered <- data %>% filter(!is.na(Waterway))

# E. coli by tide phase and waterway on a log10 axis, with dashed reference
# lines and labels for the water-quality classes.
# Zero counts have no finite log10 value, so they are dropped from the
# boxplots (ggplot2 reports this as a warning).
ggplot(data_filtered, aes(x = tide, y = `E.coli`, fill = Waterway)) +
  geom_boxplot(outlier.shape = 21, outlier.fill = "black", outlier.size = 2) +
  scale_y_log10() +  # Logarithmic scale for better visualization

  # Class reference lines. `linewidth` sets the line thickness; using `size`
  # for lines is deprecated since ggplot2 3.4.0. The Excellent and Good lines
  # sit at 250 and 500 cfu/100ml so that they agree with `ec_thresholds` and
  # with the figure caption.
  geom_hline(yintercept = 250, linetype = "dashed", color = "green", linewidth = 1) +   # Excellent threshold
  geom_hline(yintercept = 500, linetype = "dashed", color = "blue", linewidth = 1) +    # Good threshold
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange", linewidth = 1) +  # Sufficient threshold
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red", linewidth = 1) +    # Poor threshold

  # Text labels placed on the threshold lines, right-aligned at the plot edge
  annotate("text", x = 4.8, y = 250, label = "Excellent", color = "green", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 500, label = "Good", color = "blue", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 600, label = "Sufficient", color = "orange", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 1000, label = "Poor", color = "red", hjust = 1, size = 5, fontface = "bold") +

  # Titles and axis / legend labels
  labs(title = "E. coli Levels by Tide Phase and Waterway",
       y = "E. coli (cfu/100ml)",
       x = "Tide Phase",
       fill = "Waterway") +
  # One fixed colour per named waterway
  scale_fill_manual(values = c("Blackwater" = "salmon",
                               "Colne" = "green3",
                               "Roman" = "purple",
                               "Stour" = "blue")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate tide labels for better visibility
        plot.title = element_text(size = 16),               # Adjust title size
        axis.title = element_text(size = 14),               # Adjust axis label size
        legend.title = element_text(size = 12),             # Adjust legend title size
        legend.text = element_text(size = 10))              # Adjust legend text size
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 212 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#### Figure 6. This figure shows the E. coli levels (cfu/100ml) across different waterways (Blackwater, Colne, Roman, Stour, and NA) during various tidal phases (ebb, flood, slack, and NA). The waterways are represented by different colors: Blackwater (red), Colne (green), Roman (purple), Stour (blue), and NA (grey). The graph uses a logarithmic scale to better visualize the wide range of bacterial concentrations. Dashed horizontal lines indicate water quality thresholds: Excellent (green, 250 cfu/100ml), Good (blue, 500 cfu/100ml), Sufficient (orange, 600 cfu/100ml), and Poor (red, 1000+ cfu/100ml). Stour shows notably high E. coli levels during slack tide, while Colne has moderate levels. Both Roman and Blackwater exhibit lower contamination across all tidal phases.

# Keep only the samples that have a waterway recorded
data_filtered <- data %>% filter(!is.na(Waterway))

# Total coliforms by tide phase and waterway on a log10 axis, with dashed
# reference lines and labels for the water-quality classes.
# Zero counts have no finite log10 value, so they are dropped from the
# boxplots (ggplot2 reports this as a warning).
# NOTE(review): the reference lines reuse the limits drawn on the E. coli
# figure; confirm that they are meant to apply to total coliforms.
ggplot(data_filtered, aes(x = tide, y = `TotalColiforms`, fill = Waterway)) +
  geom_boxplot(outlier.shape = 21, outlier.fill = "black", outlier.size = 2) +
  scale_y_log10() +  # Logarithmic scale

  # `linewidth` sets the line thickness; using `size` for lines is deprecated
  # since ggplot2 3.4.0. The Excellent and Good lines sit at 250 and 500
  # cfu/100ml so that they agree with the figure caption.
  geom_hline(yintercept = 250, linetype = "dashed", color = "green", linewidth = 1) +   # Excellent threshold
  geom_hline(yintercept = 500, linetype = "dashed", color = "blue", linewidth = 1) +    # Good threshold
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange", linewidth = 1) +  # Sufficient threshold
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red", linewidth = 1) +    # Poor threshold

  # Text labels placed on the threshold lines, right-aligned at the plot edge
  annotate("text", x = 4.8, y = 250, label = "Excellent", color = "green", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 500, label = "Good", color = "blue", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 600, label = "Sufficient", color = "orange", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 1000, label = "Poor", color = "red", hjust = 1, size = 5, fontface = "bold") +

  labs(title = "Total Coliforms Levels by Tide Phase and Waterway",
       y = "Total Coliforms (cfu/100ml)",
       x = "Tide Phase",
       fill = "Waterway") +
  # One fixed colour per named waterway
  scale_fill_manual(values = c("Blackwater" = "salmon",
                               "Colne" = "green3",
                               "Roman" = "purple",
                               "Stour" = "blue")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 16),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 10))
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 233 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#### Figure 7. This figure presents Total Coliform levels (cfu/100ml) across different waterways (Blackwater, Colne, Roman, Stour, and NA) during varying tidal phases (ebb, flood, slack, and NA). The waterways are distinguished by colors: Blackwater (red), Colne (green), Roman (purple), Stour (blue), and NA (grey). A logarithmic scale is used for the vertical axis to accommodate a wide range of bacterial concentrations. The dashed horizontal lines mark water quality thresholds: Excellent (green, 250 cfu/100ml), Good (blue, 500 cfu/100ml), Sufficient (orange, 600 cfu/100ml), and Poor (red, 1000+ cfu/100ml). Colne consistently exhibits higher Total Coliform levels, particularly during ebb tide, while other waterways, such as Blackwater and Roman, display lower concentrations. Total Coliform levels generally stay within or near the thresholds for Good or Sufficient quality across most waterways and tide phases, except for a few outliers above the Poor threshold.

# ESBL by tide phase and waterway on a log10 axis, with dashed reference
# lines and labels for the water-quality classes. The unfiltered `data` is
# used, so samples without a waterway are included and drawn in grey.
# Zero counts have no finite log10 value, so they are dropped from the
# boxplots (ggplot2 reports this as a warning).
# NOTE(review): the reference lines reuse the limits drawn on the E. coli
# figure; confirm that they are meant to apply to ESBL counts.
ggplot(data, aes(x = tide, y = ESBL, fill = Waterway)) +

  # Create a boxplot with formatted outliers
  geom_boxplot(outlier.shape = 21, outlier.fill = "black", outlier.size = 2) +

  # Apply logarithmic scale to y-axis to manage data spread
  scale_y_log10() +

  # Add horizontal lines to show thresholds for water quality. `linewidth`
  # sets the line thickness; using `size` for lines is deprecated since
  # ggplot2 3.4.0. The Excellent and Good lines sit at 250 and 500 cfu/100ml
  # so that they agree with the figure caption.
  geom_hline(yintercept = 250, linetype = "dashed", color = "green", linewidth = 1) +   # Excellent threshold
  geom_hline(yintercept = 500, linetype = "dashed", color = "blue", linewidth = 1) +    # Good threshold
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange", linewidth = 1) +  # Sufficient threshold
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red", linewidth = 1) +    # Poor threshold

  # Annotate the threshold lines with text
  annotate("text", x = 4.8, y = 250, label = "Excellent", color = "green", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 500, label = "Good", color = "blue", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 600, label = "Sufficient", color = "orange", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 1000, label = "Poor", color = "red", hjust = 1, size = 5, fontface = "bold") +

  # Set plot labels for title, y-axis, and x-axis
  labs(title = "ESBL Levels by Tide Phase and Waterway",
       y = "ESBL (cfu/100ml)",
       x = "Tide Phase",
       fill = "Waterway") +

  # Custom colors for each waterway. Missing waterways are coloured through
  # `na.value`; an entry named "NA" in `values` would only match a waterway
  # literally called "NA" and never a missing value.
  scale_fill_manual(values = c("Blackwater" = "salmon",
                               "Colne" = "green3",
                               "Roman" = "purple",
                               "Stour" = "blue"),
                    na.value = "grey") +

  # Apply minimal theme and rotate x-axis labels
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 16),     # Set plot title size
        axis.title = element_text(size = 14),     # Set axis title size
        legend.title = element_text(size = 12),   # Set legend title size
        legend.text = element_text(size = 10))    # Set legend text size
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 278 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#### Figure 8. This figure shows ESBL (Extended Spectrum Beta-Lactamase) bacterial levels (cfu/100ml) across various waterways—Blackwater (red), Colne (green), Roman (purple), and Stour (blue)—during different tide phases (ebb, flood, slack, and NA). The chart uses a logarithmic scale to illustrate a wide range of bacterial concentrations. Dashed horizontal lines represent water quality thresholds: Excellent (green, 250 cfu/100ml), Good (blue, 500 cfu/100ml), Sufficient (orange, 600 cfu/100ml), and Poor (red, 1000+ cfu/100ml). ESBL levels are highest in Blackwater during ebb tide, with Colne also showing elevated levels, particularly during flood tide. Roman and Stour have lower ESBL concentrations, often falling within the thresholds for Excellent or Good quality.

# ESBL by tide phase and waterway on a log10 axis, with dashed reference
# lines and labels for the water-quality classes. This draws the same ESBL
# figure as the previous plot in the file (same data, layers and labels).
# Zero counts have no finite log10 value, so they are dropped from the
# boxplots (ggplot2 reports this as a warning).
# NOTE(review): the reference lines reuse the limits drawn on the E. coli
# figure; confirm that they are meant to apply to ESBL counts.
ggplot(data, aes(x = tide, y = ESBL, fill = Waterway)) +
  geom_boxplot(outlier.shape = 21, outlier.fill = "black", outlier.size = 2) +
  scale_y_log10() +  # Logarithmic scale

  # `linewidth` sets the line thickness; using `size` for lines is deprecated
  # since ggplot2 3.4.0. The Excellent and Good lines sit at 250 and 500
  # cfu/100ml so that they agree with the figure caption.
  geom_hline(yintercept = 250, linetype = "dashed", color = "green", linewidth = 1) +   # Excellent threshold
  geom_hline(yintercept = 500, linetype = "dashed", color = "blue", linewidth = 1) +    # Good threshold
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange", linewidth = 1) +  # Sufficient threshold
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red", linewidth = 1) +    # Poor threshold

  # Text labels placed on the threshold lines, right-aligned at the plot edge
  annotate("text", x = 4.8, y = 250, label = "Excellent", color = "green", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 500, label = "Good", color = "blue", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 600, label = "Sufficient", color = "orange", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 1000, label = "Poor", color = "red", hjust = 1, size = 5, fontface = "bold") +

  labs(title = "ESBL Levels by Tide Phase and Waterway",
       y = "ESBL (cfu/100ml)",
       x = "Tide Phase",
       fill = "Waterway") +
  # Missing waterways are coloured through `na.value`; an entry named "NA" in
  # `values` would only match a waterway literally called "NA" and never a
  # missing value.
  scale_fill_manual(values = c("Blackwater" = "salmon",
                               "Colne" = "green3",
                               "Roman" = "purple",
                               "Stour" = "blue"),
                    na.value = "grey") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 16),
        axis.title = element_text(size = 14),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 10))
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 278 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#### Figure 9. This graph displays the levels of ESBL (Extended Spectrum Beta-Lactamase) bacteria across different waterways (Blackwater, Colne, Roman, and Stour) during various tidal phases (ebb, flood, slack, and NA). The bacterial concentrations are measured in cfu/100ml. The horizontal dashed lines represent water quality thresholds: Excellent (green, 250 cfu/100ml), Good (blue, 500 cfu/100ml), Sufficient (orange, 600 cfu/100ml), and Poor (red, above 1000 cfu/100ml). ESBL levels are highest in Blackwater during the ebb tide, with the flood tide in the Colne waterway also showing elevated levels that approach the Poor threshold. Roman and Stour waterways generally exhibit lower ESBL levels.

# Prepare `data` for the summaries below: fix column types, add the month of
# collection, and drop columns that are not analysed.
data <- data %>%
  mutate(
    # Date and time of collection. read_sheet() delivers both columns as
    # date-times, in which case `format` is not used; it only matters if a
    # column ever arrives as text.
    `dateCollected (dd/mm/yy)` = as.Date(`dateCollected (dd/mm/yy)`, format = "%d/%m/%y"),
    `timeCollected (hh:mm)` = as.POSIXct(`timeCollected (hh:mm)`, format = "%H:%M"),
    # Month of collection as a factor ordered July to June. The name is looked
    # up from the month number in R's built-in English `month.name`, because
    # format(x, "%B") returns month names in the current locale: on a
    # non-English system they would match none of the English levels and every
    # month would silently become NA.
    month = factor(
      month.name[as.integer(format(`dateCollected (dd/mm/yy)`, "%m"))],
      levels = month.name[c(7:12, 1:6)]
    ),
    # Make sure the bacteria counts are numeric
    across(c(`E.coli`, `TotalColiforms`, entericEnterococci, ESBL, VRE), as.numeric)
  ) %>%
  # Remove unnecessary columns
  select(-c(`TimeSinceHighTide (hh:mm)`, `PlusCode`, notes))
# Overall mean of every bacterial measurement, ignoring missing values.
# Each result column is named "<measurement>_mean".
summary_stats <- data %>%
  summarise(
    across(
      c(`E.coli`, entericEnterococci, ESBL, VRE, TotalColiforms, ColiformsNonEC),
      ~ mean(.x, na.rm = TRUE),
      .names = "{.col}_mean"
    )
  )

summary_stats
## # A tibble: 1 × 6
##   E.coli_mean entericEnterococci_mean ESBL_mean VRE_mean TotalColiforms_mean
##         <dbl>                   <dbl>     <dbl>    <dbl>               <dbl>
## 1        993.                    618.     1213.     461.                783.
## # ℹ 1 more variable: ColiformsNonEC_mean <dbl>
# Mean E. coli, enterococci, ESBL and VRE counts for each waterway, ignoring
# missing values. A waterway with no measurements at all yields NaN.
average_concentrations <- data %>%
  group_by(Waterway) %>%
  summarise(
    across(
      c(`E.coli`, entericEnterococci, ESBL, VRE),
      ~ mean(.x, na.rm = TRUE),
      .names = "Avg_{.col}"
    )
  ) %>%
  rename(Avg_Enterococci = Avg_entericEnterococci)

# Show the per-waterway averages
average_concentrations
## # A tibble: 6 × 5
##   Waterway     Avg_E.coli Avg_Enterococci Avg_ESBL Avg_VRE
##   <chr>             <dbl>           <dbl>    <dbl>   <dbl>
## 1 ""                NaN              NaN     NaN      NaN 
## 2 "Blackwater"       44.3            634     764      236 
## 3 "Colne"           585.             394.   1635.     384.
## 4 "Roman"          1072              496    1088       80 
## 5 "Stour"          3218.            1575.     21.3   1052 
## 6  <NA>              42               81     NaN      NaN
# Class limits (cfu/100ml) for E. coli and for enteric enterococci
ec_thresholds <- c(Excellent = 250, Good = 500, Sufficient = 500, Poor = Inf)
enterococci_thresholds <- c(Excellent = 100, Good = 200, Sufficient = 185, Poor = Inf)

# Assign a water-quality class to each count in `counts`.
#   counts     - numeric vector of bacterial counts (cfu/100ml)
#   thresholds - named numeric vector with "Excellent" and "Good" upper limits
# Returns a character vector the same length as `counts`.
#
# A missing count has no class and stays NA. Without the explicit NA case,
# `NA <= limit` is NA, so an unmeasured sample would fall through to the
# catch-all branch and be reported as "Poor".
#
# There is no "Sufficient" test: in both threshold vectors the Sufficient
# limit is not above the Good limit (500 vs 500, 185 vs 200), so a
# `<= Sufficient` check placed after `<= Good` could never match.
# NOTE(review): the Sufficient limits look like percentile-based criteria
# rather than per-sample limits - confirm how they should be applied.
classify_quality <- function(counts, thresholds) {
  case_when(
    is.na(counts) ~ NA_character_,
    counts <= thresholds[["Excellent"]] ~ "Excellent",
    counts <= thresholds[["Good"]] ~ "Good",
    TRUE ~ "Poor"
  )
}

# Compare samples against thresholds
data <- data %>%
  mutate(
    E.coli_quality = classify_quality(`E.coli`, ec_thresholds),
    entericEnterococci_quality = classify_quality(entericEnterococci, enterococci_thresholds)
  )

# Display the updated dataset with water quality categories
data
## # A tibble: 297 × 22
##    sampleID siteName               siteID `dateCollected (dd/mm/yy)` Waterway
##    <chr>    <chr>                  <chr>  <date>                     <chr>   
##  1 AA01     CampusLake             WP_1   2023-07-05                 <NA>    
##  2 AA02     Wivenhoe tidal barrier Co_1   2023-07-25                 Colne   
##  3 AA03     Wivenhoe tidal barrier Co_1   2023-07-25                 Colne   
##  4 AA04     Wivenhoe tidal barrier Co_1   2023-07-25                 Colne   
##  5 AA05     Alresford creek        Co_2   2023-07-25                 Colne   
##  6 AA06     Alresford creek        Co_2   2023-07-25                 Colne   
##  7 AA07     Alresford creek        Co_2   2023-07-25                 Colne   
##  8 AA08     buoy 17                Co_4   2023-07-25                 Colne   
##  9 AA09     buoy 17                Co_4   2023-07-25                 Colne   
## 10 AA10     buoy 17                Co_4   2023-07-25                 Colne   
## # ℹ 287 more rows
## # ℹ 17 more variables: `timeCollected (hh:mm)` <dttm>, tide <chr>,
## #   collector <chr>, `latitude (xx.xxxxxx)` <dbl>,
## #   `longitude (x.xxxxxx)` <dbl>, waterSample <dbl>, sedimentSample <dbl>,
## #   daysSinceRain <dbl>, ColiformsNonEC <dbl>, E.coli <dbl>,
## #   TotalColiforms <dbl>, entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>,
## #   month <fct>, E.coli_quality <chr>, entericEnterococci_quality <chr>
View(data)
# Count samples per month and E. coli quality class.
# NOTE(review): samples without an E. coli reading are excluded before
# counting. case_when() treats an NA comparison as "no match", so an unmeasured
# sample could otherwise fall through to a catch-all class and be counted as if
# it had been measured - confirm against the classification step.
# `.groups = "drop"` returns an ungrouped tibble and avoids summarise()'s
# regrouping message.
quality_counts <- data %>%
  filter(!is.na(`E.coli`)) %>%
  group_by(month, E.coli_quality) %>%
  summarise(count = n(), .groups = "drop")

# Keep complete rows only and order the classes from best to worst, so the
# bars and the legend read Excellent -> Poor instead of alphabetically.
quality_counts_filtered <- quality_counts %>%
  filter(!is.na(month), !is.na(E.coli_quality)) %>%
  mutate(
    E.coli_quality = factor(
      E.coli_quality,
      levels = c("Excellent", "Good", "Sufficient", "Poor")
    )
  )

# E. coli Quality Counts by Month
ggplot(quality_counts_filtered, aes(x = month, y = count, fill = E.coli_quality)) +
  geom_col(position = "dodge") +
  labs(title = "E. coli Quality Classification Counts by Month", y = "Count", x = "Month") +
  scale_fill_manual(values = c("Excellent" = "lightblue", "Good" = "blue", "Sufficient" = "#D2B48C", "Poor" = "brown"))

# Count samples per month and Enterococci quality class.
# NOTE(review): samples without an Enterococci reading are excluded before
# counting. case_when() treats an NA comparison as "no match", so an unmeasured
# sample could otherwise fall through to the catch-all "Poor" class - confirm
# against the classification step.
# `.groups = "drop"` returns an ungrouped tibble and avoids summarise()'s
# regrouping message.
quality_counts <- data %>%
  filter(!is.na(entericEnterococci)) %>%
  group_by(month, entericEnterococci_quality) %>%
  summarise(count = n(), .groups = "drop")

# Keep complete rows only and order the classes from best to worst, so the
# bars and the legend read Excellent -> Poor instead of alphabetically.
quality_counts_filtered <- quality_counts %>%
  filter(!is.na(month), !is.na(entericEnterococci_quality)) %>%
  mutate(
    entericEnterococci_quality = factor(
      entericEnterococci_quality,
      levels = c("Excellent", "Good", "Sufficient", "Poor")
    )
  )

# Enterococci Quality Counts by Month
ggplot(quality_counts_filtered, aes(x = month, y = count, fill = entericEnterococci_quality)) +
  geom_col(position = "dodge") +
  labs(title = "Enterococci Quality Classification Counts by Month", y = "Count", x = "Month") +
  scale_fill_manual(values = c("Excellent" = "lightblue", "Good" = "blue", "Sufficient" = "#D2B48C", "Poor" = "brown"))

#### Figures 10 & 11. These two figures present the monthly distribution of water quality classifications for E. coli and Enterococci, based on contamination thresholds of Excellent (light blue), Good (blue), Sufficient (tan), and Poor (brown). In both figures, August exhibits the highest number of Excellent quality counts, while Poor quality levels are consistently observed across July, September, and October. Good quality counts remain relatively low for both bacteria, with slight increases observed in September and October. The figures highlight that August stands out for its superior water quality in terms of E. coli and Enterococci, while Poor classifications are spread more evenly across the other months, especially during mid-summer and early fall.

# Five-number summaries (plus mean and NA count) of the two indicator counts,
# pooled over every sample in `data`
cat("Summary of E.coli concentrations across sites:")
## Summary of E.coli concentrations across sites:
summary(data[["E.coli"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     8.0    86.0   993.3   568.0 22160.0     188
cat("Summary of Enterococci concentrations across sites:")
## Summary of Enterococci concentrations across sites:
summary(data[["entericEnterococci"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     4.0    82.0   618.2   395.0  9760.0     186
# Mean count of each indicator per waterway; missing readings are ignored, so
# a waterway with no readings for an indicator yields NaN
average_concentrations <- summarise(
  group_by(data, Waterway),
  Avg_E.coli = mean(`E.coli`, na.rm = TRUE),
  Avg_Enterococci = mean(entericEnterococci, na.rm = TRUE),
  Avg_ESBL = mean(ESBL, na.rm = TRUE),
  Avg_VRE = mean(VRE, na.rm = TRUE)
)

average_concentrations
## # A tibble: 6 × 5
##   Waterway     Avg_E.coli Avg_Enterococci Avg_ESBL Avg_VRE
##   <chr>             <dbl>           <dbl>    <dbl>   <dbl>
## 1 ""                NaN              NaN     NaN      NaN 
## 2 "Blackwater"       44.3            634     764      236 
## 3 "Colne"           585.             394.   1635.     384.
## 4 "Roman"          1072              496    1088       80 
## 5 "Stour"          3218.            1575.     21.3   1052 
## 6  <NA>              42               81     NaN      NaN
# Summary table of average concentrations by waterway and month.
# `.groups = "drop_last"` spells out the grouping summarise() would pick by
# default (the result stays grouped by Waterway), which makes the intent
# explicit and avoids the "has grouped output by" message.
average_concentrations_by_waterway_month <- data %>%
  group_by(Waterway, month) %>%
  summarise(
    Avg_E.coli = mean(`E.coli`, na.rm = TRUE),
    Avg_Enterococci = mean(entericEnterococci, na.rm = TRUE),
    Avg_ESBL = mean(ESBL, na.rm = TRUE),
    Avg_VRE = mean(VRE, na.rm = TRUE),
    .groups = "drop_last"
  )
average_concentrations_by_waterway_month
## # A tibble: 12 × 6
## # Groups:   Waterway [6]
##    Waterway     month     Avg_E.coli Avg_Enterococci Avg_ESBL Avg_VRE
##    <chr>        <fct>          <dbl>           <dbl>    <dbl>   <dbl>
##  1 ""           <NA>           NaN              NaN     NaN      NaN 
##  2 "Blackwater" August          31.3             16     NaN      NaN 
##  3 "Blackwater" September       96             2488     764      236 
##  4 "Colne"      July          1094.             980     NaN      NaN 
##  5 "Colne"      August         298.             101     NaN      NaN 
##  6 "Colne"      September      718.             332.   1635.     384.
##  7 "Colne"      October         65.6             24     NaN      NaN 
##  8 "Roman"      October       1072              496    1088       80 
##  9 "Stour"      August        4204.            1971.    NaN      NaN 
## 10 "Stour"      September      771.             876     NaN      NaN 
## 11 "Stour"      October       3564              736      21.3   1052 
## 12  <NA>        July            42               81     NaN      NaN
# Reference limits for the inland bathing-water lines: one named numeric
# vector per indicator organism, keyed by quality class
inland_thresholds <- list(
  E.coli = setNames(
    c(500, 1000, 900, Inf),
    c("Excellent", "Good", "Sufficient", "Poor")
  ),
  enterococci = setNames(
    c(200, 400, 330, Inf),
    c("Excellent", "Good", "Sufficient", "Poor")
  )
)

# Turn the "NC" placeholder into NA; only character columns are touched
data <- mutate(
  data,
  across(where(is.character), function(column) na_if(column, "NC"))
)

# Keep only the samples whose Waterway is not NA
data_filtered <- filter(data, !is.na(Waterway))

#' Boxplots of a bacterial count on a log10 axis with quality reference lines
#'
#' Draws one boxplot per x category and Waterway, then overlays dashed lines at
#' the "Excellent", "Good" and "Sufficient" limits. The red dotted line
#' labelled "Poor" is drawn at the largest observed value of `y_var`, not at a
#' limit taken from `thresholds`.
#'
#' Zero counts become -Inf on the log10 axis and are dropped from the boxplots;
#' ggplot2 reports this as "non-finite" rows being removed.
#'
#' @param data Data frame containing `x_var`, `y_var` and a `Waterway` column.
#' @param y_var Name (string) of the numeric column plotted on the y-axis.
#' @param thresholds Named numeric vector with elements "Excellent", "Good"
#'   and "Sufficient".
#' @param title Plot title.
#' @param y_label Label for the y-axis.
#' @param x_var Name (string) of the column placed on the x-axis.
#' @param x_label Label for the x-axis.
#' @return A ggplot object.
plot_with_thresholds_log <- function(data, y_var, thresholds, title, y_label,
                                     x_var = "month", x_label = "Month") {
  # Fail early with a clear message instead of a rendering-time error
  stopifnot(
    is.character(y_var), length(y_var) == 1, y_var %in% names(data),
    is.character(x_var), length(x_var) == 1, x_var %in% names(data)
  )

  # Computed once; used for both the dotted line and its label
  observed_max <- max(data[[y_var]], na.rm = TRUE)

  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]], fill = Waterway)) +
    geom_boxplot() +
    geom_hline(yintercept = thresholds["Excellent"], linetype = "dashed", color = "green", linewidth = 1) +
    geom_hline(yintercept = thresholds["Good"], linetype = "dashed", color = "blue", linewidth = 1) +
    geom_hline(yintercept = thresholds["Sufficient"], linetype = "dashed", color = "orange", linewidth = 1) +
    geom_hline(yintercept = observed_max, linetype = "dotted", color = "red", linewidth = 1) +  # Poor line
    scale_y_log10() +  # Apply log scale to y-axis
    labs(title = title, y = y_label, x = x_label) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    # x = Inf pins each label to the right-hand edge of the panel
    annotate("text", x = Inf, y = thresholds["Excellent"], label = "Excellent", color = "green", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = thresholds["Good"], label = "Good", color = "blue", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = thresholds["Sufficient"], label = "Sufficient", color = "orange", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = observed_max, label = "Poor", color = "red", hjust = 1.1, vjust = 2)
}

# E. coli by waterway and month, drawn from the waterway-filtered data
plot_with_thresholds_log(
  data = data_filtered,
  y_var = "E.coli",
  thresholds = inland_thresholds[["E.coli"]],
  title = "E. coli Levels by Waterway and Month",
  y_label = "E. coli (cfu/100ml)"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 212 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Enterococci by waterway and month, drawn from the waterway-filtered data
plot_with_thresholds_log(
  data = data_filtered,
  y_var = "entericEnterococci",
  thresholds = inland_thresholds[["enterococci"]],
  title = "Enterococci Levels by Waterway and Month",
  y_label = "Enterococci (cfu/100ml)"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 211 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Plot for resistant bacteria (ESBL and VRE) by month (filtered data).
# Each layer sets its own constant fill ("ESBL" / "VRE"), so the boxes are
# grouped by month and bacteria type only. A plot-level `fill = Waterway`
# mapping would be overridden by both layers, so it is not set, and the title
# names the grouping that is actually drawn.
ggplot(data_filtered, aes(x = month)) +
  geom_boxplot(aes(y = ESBL, fill = "ESBL"), color = "darkred") +
  geom_boxplot(aes(y = VRE, fill = "VRE"), color = "darkblue", alpha = 0.5) +
  scale_y_log10() +  # Apply log scale to y-axis
  labs(title = "Resistant Bacteria (ESBL & VRE) Levels by Month", y = "Concentration (cfu/100ml)", fill = "Bacteria Type") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 277 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 280 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#### Figures 12, 13, and 14. These figures compare bacterial contamination levels across different waterways (Blackwater, Colne, Roman, Stour) by month, highlighting trends for E. coli, Enterococci, and antibiotic-resistant bacteria (ESBL and VRE). In Figure 12, E. coli levels show varying trends, with peaks observed in different waterways across July to October, and thresholds for Excellent, Good, Sufficient, and Poor water quality are marked on the graph. Similarly, Figure 13 presents Enterococci levels by month and waterway, with significant variability between months and waterways, with Blackwater showing elevated levels in September, and Colne in July. Threshold lines for water quality classification are also indicated. Figure 14 focuses on antibiotic-resistant bacteria, comparing ESBL and VRE concentrations across months. While both resistant bacteria types are present, VRE shows a higher concentration, particularly in September.

# Reference limits for the inland bathing-water lines (same values as used for
# the by-month plots): one named numeric vector per indicator organism
inland_thresholds <- list(
  E.coli = setNames(
    c(500, 1000, 900, Inf),
    c("Excellent", "Good", "Sufficient", "Poor")
  ),
  enterococci = setNames(
    c(200, 400, 330, Inf),
    c("Excellent", "Good", "Sufficient", "Poor")
  )
)

# Turn the "NC" placeholder into NA; only character columns are touched
data <- mutate(
  data,
  across(where(is.character), function(column) na_if(column, "NC"))
)

# Keep only the samples whose Waterway is not NA
data_filtered <- filter(data, !is.na(Waterway))

#' Boxplots of a bacterial count on a log10 axis with quality reference lines
#'
#' Draws one boxplot per x category (site ID by default), filled by Waterway,
#' then overlays dashed lines at the "Excellent", "Good" and "Sufficient"
#' limits. The red dotted line labelled "Poor" is drawn at the largest observed
#' value of `y_var`, not at a limit taken from `thresholds`.
#'
#' Zero counts become -Inf on the log10 axis and are dropped from the boxplots;
#' ggplot2 reports this as "non-finite" rows being removed.
#'
#' @param data Data frame containing `x_var`, `y_var` and a `Waterway` column.
#' @param y_var Name (string) of the numeric column plotted on the y-axis.
#' @param thresholds Named numeric vector with elements "Excellent", "Good"
#'   and "Sufficient".
#' @param title Plot title.
#' @param y_label Label for the y-axis.
#' @param x_var Name (string) of the column placed on the x-axis.
#' @param x_label Label for the x-axis.
#' @return A ggplot object.
plot_with_thresholds_log <- function(data, y_var, thresholds, title, y_label,
                                     x_var = "siteID", x_label = "Site ID") {
  # Fail early with a clear message instead of a rendering-time error
  stopifnot(
    is.character(y_var), length(y_var) == 1, y_var %in% names(data),
    is.character(x_var), length(x_var) == 1, x_var %in% names(data)
  )

  # Computed once; used for both the dotted line and its label
  observed_max <- max(data[[y_var]], na.rm = TRUE)

  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]], fill = Waterway)) +
    geom_boxplot() +
    geom_hline(yintercept = thresholds["Excellent"], linetype = "dashed", color = "green", linewidth = 1) +
    geom_hline(yintercept = thresholds["Good"], linetype = "dashed", color = "blue", linewidth = 1) +
    geom_hline(yintercept = thresholds["Sufficient"], linetype = "dashed", color = "orange", linewidth = 1) +
    geom_hline(yintercept = observed_max, linetype = "dotted", color = "red", linewidth = 1) +  # Poor line
    scale_y_log10() +  # Apply log scale to y-axis
    labs(title = title, y = y_label, x = x_label) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    # x = Inf pins each label to the right-hand edge of the panel
    annotate("text", x = Inf, y = thresholds["Excellent"], label = "Excellent", color = "green", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = thresholds["Good"], label = "Good", color = "blue", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = thresholds["Sufficient"], label = "Sufficient", color = "orange", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = observed_max, label = "Poor", color = "red", hjust = 1.1, vjust = 2)
}

# E. coli by site ID, drawn from the waterway-filtered data
plot_with_thresholds_log(
  data = data_filtered,
  y_var = "E.coli",
  thresholds = inland_thresholds[["E.coli"]],
  title = "E. coli Levels by Site ID",
  y_label = "E. coli (cfu/100ml)"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 212 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Enterococci by site ID, drawn from the waterway-filtered data
plot_with_thresholds_log(
  data = data_filtered,
  y_var = "entericEnterococci",
  thresholds = inland_thresholds[["enterococci"]],
  title = "Enterococci Levels by Site ID",
  y_label = "Enterococci (cfu/100ml)"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 211 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Plot for resistant bacteria (ESBL and VRE) with Site ID (filtered data).
# Each layer sets its own constant fill ("ESBL" / "VRE"), so the boxes are
# grouped by site and bacteria type only. A plot-level `fill = Waterway`
# mapping would be overridden by both layers, so it is not set.
ggplot(data_filtered, aes(x = siteID)) +
  geom_boxplot(aes(y = ESBL, fill = "ESBL"), color = "darkred") +
  geom_boxplot(aes(y = VRE, fill = "VRE"), color = "darkblue", alpha = 0.5) +
  scale_y_log10() +  # Apply log scale to y-axis
  labs(title = "Resistant Bacteria (ESBL & VRE) Levels by Site ID", y = "Concentration (cfu/100ml)", fill = "Bacteria Type") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 277 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 280 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#### Figures 15, 16, and 17. In these figures, we analyze the concentrations of E. coli, Enterococci, and resistant bacteria (ESBL and VRE) across various site IDs for the Blackwater, Colne, Roman, and Stour waterways. Each figure visualizes bacterial concentration data in cfu/100ml using box plots for individual site IDs, overlaid with water quality thresholds: Excellent, Good, Sufficient, and Poor. In the E. coli figure, high concentrations are observed in sites like Stour (S1, S3) and Colne (C9), surpassing the Poor quality threshold, while most other sites fall within the Sufficient or Good thresholds. Enterococci concentrations follow a similar pattern, with several sites exceeding Poor thresholds, particularly in Stour and Colne, indicating substantial contamination at certain points. Resistant bacteria (ESBL and VRE) show a diverse distribution, with ESBL concentrations significantly higher at some sites (e.g., Hy1 and Hy2) while VRE levels are more sporadic, indicating variability in antibiotic-resistant bacterial presence across sites.

# Hand-entered counts, one row per sample. Date is a factor whose levels follow
# sampling order, so the x-axis is chronological rather than alphabetical.
data <- local({
  sampling_dates <- c(
    "25-07-2023", "10-08-2023", "11-10-2023",
    "08-12-2023", "01-03-2024", "27-04-2024"
  )
  samples_per_date <- c(3, 3, 2, 1, 1, 1)

  data.frame(
    Date = factor(
      rep(sampling_dates, times = samples_per_date),
      levels = sampling_dates
    ),
    Enterococci = c(432, 688, 540, 40, 160, 40, 31, 22, NA, 290, 20),
    E_coli = c(820, 1016, 728, 40, 160, 40, 52, 304, 3280, 480, 1410)
  )
})

# Reshape to long format: one row per sample and bacteria type
library(reshape2)
data_long <- reshape2::melt(
  data,
  id.vars = "Date",
  variable.name = "Bacteria",
  value.name = "Count"
)

# Boxplots per sampling date, one panel per bacteria type. The four dashed
# lines are example levels for "Excellent" (200), "Good" (400),
# "Sufficient" (600) and "Poor" (1000), added in that order after the boxes.
ggplot(data_long, aes(x = Date, y = Count)) +
  geom_boxplot(aes(fill = Bacteria)) +
  Map(
    function(level, line_colour) {
      geom_hline(yintercept = level, linetype = "dashed", color = line_colour)
    },
    c(200, 400, 600, 1000),
    c("green", "blue", "orange", "red")
  ) +
  labs(
    title = "Bacterial Levels by Date (Site Co_1)",
    y = "Bacteria Count (cfu/100ml)",
    x = "Date Collected"
  ) +
  facet_wrap(~Bacteria, scales = "free_y") +  # one panel per bacteria type
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

#### Figure 18. This figure illustrates the bacterial contamination levels of Enterococci and E. coli at Site Co_1 over various sampling dates, with the x-axis displaying the collection dates (from July 2023 to April 2024) and the y-axis representing bacterial counts in cfu/100ml. The left panel shows Enterococci levels in red boxplots, while the right panel presents E. coli levels in teal boxplots. Both panels are overlaid with dashed lines indicating water quality thresholds: green for Excellent, blue for Good, orange for Sufficient, and red for Poor. The highest Enterococci concentrations were observed on 25-07-2023, exceeding the Sufficient threshold, followed by a significant drop on 10-08-2023, with consistently low levels afterward. For E. coli, the highest concentration occurred on 10-08-2023, exceeding the Good threshold, followed by a decrease below the Excellent threshold by 08-12-2023, and a subsequent spike on 27-04-2024 approaching the Poor threshold. The graph effectively visualizes changes in bacterial contamination and fluctuations in water quality at Site Co_1 over time.

# Hand-entered counts, one row per sample, with the collection date stored as
# a Date (ISO year-month-day strings)
data <- local({
  collection_days <- c(
    "2023-10-11", "2023-07-25", "2023-08-10", "2023-10-11", "2023-12-08",
    "2024-03-01", "2024-04-27", "2024-06-06", "2024-07-05"
  )
  samples_per_day <- c(2, 3, 3, 2, 1, 1, 1, 1, 1)

  data.frame(
    Date = as.Date(
      rep(collection_days, times = samples_per_day),
      format = "%Y-%m-%d"
    ),
    Enterococci = c(19, 9, 432, 688, 540, 0, 0, 0, 31, 22, 3280, 290, 20, 560, 200),
    E_coli = c(84, 34, 820, 1016, 728, 0, 0, 0, 52, 304, 0, 480, 2820, 720, 720)
  )
})

# Convert Date to Month.
# The month name is looked up from the month number so the result does not
# depend on the session locale: format(Date, "%B") yields localized names,
# while the factor levels used below are the English `month.name`.
data$Month <- month.name[as.integer(format(data$Date, "%m"))]

# Melt the data to long format for ggplot.
# Only the two bacterial counts are measure variables. With id.vars = "Month"
# alone, melt() treats every other column as a measurement, so the Date column
# would be stacked as a third "Bacteria" level and show up as its own panel.
data_long <- melt(
  data,
  id.vars = "Month",
  measure.vars = c("Enterococci", "E_coli"),
  variable.name = "Bacteria",
  value.name = "Count"
)
# Reorder the levels of the Month factor
data_long$Month <- factor(data_long$Month, levels = month.name)

# Create the boxplot
ggplot(data_long, aes(x = Month, y = Count)) +
  geom_boxplot(aes(fill = Bacteria)) +
  geom_hline(yintercept = 200, linetype = "dashed", color = "green") + # Example threshold for "Excellent"
  geom_hline(yintercept = 400, linetype = "dashed", color = "blue") + # Example threshold for "Good"
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange") + # Example threshold for "Sufficient"
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red") + # Poor threshold
  labs(title = "Bacterial Levels by Month (Site Specific)", y = "Bacteria Count (cfu/100ml)", x = "Month") +
  facet_wrap(~Bacteria, scales = "free_y") +  # Facet by Bacteria type
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#### Figure 19. This figure presents bacterial levels of Enterococci and E. coli over different months at a specific site. The x-axis represents months from March to December, and the y-axis represents the bacterial count in cfu/100ml. The graph consists of two panels: the first for Enterococci and the second for E. coli. The dashed lines correspond to water quality thresholds: green for Excellent, blue for Good, orange for Sufficient, and red for Poor. The Enterococci panel shows concentrations peaking in July and remaining above the Poor threshold by December, while concentrations in other months remain below the Excellent threshold. Similarly, E. coli levels peak in July and gradually decrease by December, with some values exceeding the Poor threshold in August.