# Install necessary packages
install.packages("googlesheets4")
## Installing package into '/home/catherinetaylor35/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("googledrive")
## Installing package into '/home/catherinetaylor35/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/home/catherinetaylor35/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("dplyr")
## Installing package into '/home/catherinetaylor35/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
# Load libraries
library(googlesheets4)
library(googledrive)
##
## Attaching package: 'googledrive'
## The following objects are masked from 'package:googlesheets4':
##
## request_generate, request_make
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the necessary libraries
library(googlesheets4)
# The sheet is shared publicly, so put googlesheets4 into its de-authorised
# state; no Google sign-in is needed to read it.
gs4_deauth()
# Read columns A:W of the public sheet. The header row supplies the column
# names, and cells holding "NC" or "NA" are imported as missing values.
data <- read_sheet(
  ss = "https://docs.google.com/spreadsheets/d/1Z1qwWgIIq1m1TmJi9HJ0JDe69xZCJkABx05OdabkYF8/edit?usp=sharing",
  range = "A:W",
  na = c("NC", "NA"),
  col_names = TRUE
)
## ✔ Reading from "DataCollectionForm".
## ✔ Range 'A:W'.
# Preview the first six rows
head(data, n = 6L)
## # A tibble: 6 × 23
## sampleID siteName siteID `dateCollected (dd/mm/yy)` Waterway
## <chr> <chr> <chr> <dttm> <chr>
## 1 AA01 CampusLake WP_1 2023-07-05 00:00:00 <NA>
## 2 AA02 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 3 AA03 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 4 AA04 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 5 AA05 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## 6 AA06 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## # ℹ 18 more variables: `timeCollected (hh:mm)` <dttm>,
## # `highTide (hh:mm)` <list>, `TimeSinceHighTide (hh:mm)` <dttm>, tide <chr>,
## # collector <chr>, `latitude (xx.xxxxxx)` <dbl>,
## # `longitude (x.xxxxxx)` <dbl>, PlusCode <chr>, waterSample <dbl>,
## # sedimentSample <dbl>, daysSinceRain <dbl>, notes <chr>,
## # ColiformsNonEC <dbl>, E.coli <dbl>, TotalColiforms <dbl>,
## # entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>
# Read the sheet again with the same settings as above (columns A:W, header
# row as names, "NC"/"NA" treated as missing). The sheet must be shared so that
# it can be read without signing in.
data <- read_sheet(
  ss = "https://docs.google.com/spreadsheets/d/1Z1qwWgIIq1m1TmJi9HJ0JDe69xZCJkABx05OdabkYF8/edit?usp=sharing",
  range = "A:W",
  na = c("NC", "NA"),
  col_names = TRUE
)
## ✔ Reading from "DataCollectionForm".
## ✔ Range 'A:W'.
# Inspect the first six rows
head(data, n = 6L)
## # A tibble: 6 × 23
## sampleID siteName siteID `dateCollected (dd/mm/yy)` Waterway
## <chr> <chr> <chr> <dttm> <chr>
## 1 AA01 CampusLake WP_1 2023-07-05 00:00:00 <NA>
## 2 AA02 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 3 AA03 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 4 AA04 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 5 AA05 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## 6 AA06 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## # ℹ 18 more variables: `timeCollected (hh:mm)` <dttm>,
## # `highTide (hh:mm)` <list>, `TimeSinceHighTide (hh:mm)` <dttm>, tide <chr>,
## # collector <chr>, `latitude (xx.xxxxxx)` <dbl>,
## # `longitude (x.xxxxxx)` <dbl>, PlusCode <chr>, waterSample <dbl>,
## # sedimentSample <dbl>, daysSinceRain <dbl>, notes <chr>,
## # ColiformsNonEC <dbl>, E.coli <dbl>, TotalColiforms <dbl>,
## # entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>
# Re-read the sheet, then drop "highTide (hh:mm)", which is imported as a list
# column (see the column summary printed by the earlier head() calls).
data <- read_sheet(
  ss = "https://docs.google.com/spreadsheets/d/1Z1qwWgIIq1m1TmJi9HJ0JDe69xZCJkABx05OdabkYF8/edit?usp=sharing",
  range = "A:W",
  na = c("NC", "NA"),
  col_names = TRUE
)
## ✔ Reading from "DataCollectionForm".
## ✔ Range 'A:W'.
data <- dplyr::select(data, -all_of("highTide (hh:mm)"))
# Inspect the first six rows of the remaining 22 columns
head(data, n = 6L)
## # A tibble: 6 × 22
## sampleID siteName siteID `dateCollected (dd/mm/yy)` Waterway
## <chr> <chr> <chr> <dttm> <chr>
## 1 AA01 CampusLake WP_1 2023-07-05 00:00:00 <NA>
## 2 AA02 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 3 AA03 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 4 AA04 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 5 AA05 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## 6 AA06 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## # ℹ 17 more variables: `timeCollected (hh:mm)` <dttm>,
## # `TimeSinceHighTide (hh:mm)` <dttm>, tide <chr>, collector <chr>,
## # `latitude (xx.xxxxxx)` <dbl>, `longitude (x.xxxxxx)` <dbl>, PlusCode <chr>,
## # waterSample <dbl>, sedimentSample <dbl>, daysSinceRain <dbl>, notes <chr>,
## # ColiformsNonEC <dbl>, E.coli <dbl>, TotalColiforms <dbl>,
## # entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>
# List the column names of the imported table
colnames(data)
## [1] "sampleID" "siteName"
## [3] "siteID" "dateCollected (dd/mm/yy)"
## [5] "Waterway" "timeCollected (hh:mm)"
## [7] "TimeSinceHighTide (hh:mm)" "tide"
## [9] "collector" "latitude (xx.xxxxxx)"
## [11] "longitude (x.xxxxxx)" "PlusCode"
## [13] "waterSample" "sedimentSample"
## [15] "daysSinceRain" "notes"
## [17] "ColiformsNonEC" "E.coli"
## [19] "TotalColiforms" "entericEnterococci"
## [21] "ESBL" "VRE"
# Print the dataset to the console (useful for debugging). `data` is a tibble,
# so only the first 10 rows and the columns that fit the console are shown.
print(data)
## # A tibble: 297 × 22
## sampleID siteName siteID `dateCollected (dd/mm/yy)` Waterway
## <chr> <chr> <chr> <dttm> <chr>
## 1 AA01 CampusLake WP_1 2023-07-05 00:00:00 <NA>
## 2 AA02 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 3 AA03 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 4 AA04 Wivenhoe tidal barrier Co_1 2023-07-25 00:00:00 Colne
## 5 AA05 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## 6 AA06 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## 7 AA07 Alresford creek Co_2 2023-07-25 00:00:00 Colne
## 8 AA08 buoy 17 Co_4 2023-07-25 00:00:00 Colne
## 9 AA09 buoy 17 Co_4 2023-07-25 00:00:00 Colne
## 10 AA10 buoy 17 Co_4 2023-07-25 00:00:00 Colne
## # ℹ 287 more rows
## # ℹ 17 more variables: `timeCollected (hh:mm)` <dttm>,
## # `TimeSinceHighTide (hh:mm)` <dttm>, tide <chr>, collector <chr>,
## # `latitude (xx.xxxxxx)` <dbl>, `longitude (x.xxxxxx)` <dbl>, PlusCode <chr>,
## # waterSample <dbl>, sedimentSample <dbl>, daysSinceRain <dbl>, notes <chr>,
## # ColiformsNonEC <dbl>, E.coli <dbl>, TotalColiforms <dbl>,
## # entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>
# Open the data in RStudio's spreadsheet-like viewer. The viewer is only
# useful in an interactive session, so skip it when the script is knitted or
# run in batch mode.
if (interactive()) {
  View(data)
}
# Data cleaning and transformation: type the date, time and count columns,
# turn the grouping variables into factors, and drop samples with no waterway.
data_cleaned <- data %>%
  mutate(
    # Collection date as Date, collection time as POSIXct
    `dateCollected (dd/mm/yy)` = as.Date(`dateCollected (dd/mm/yy)`, format = "%d/%m/%y"),
    `timeCollected (hh:mm)` = as.POSIXct(`timeCollected (hh:mm)`, format = "%H:%M"),
    # Bacterial counts as numerics for the statistics and plots that follow
    across(c(`E.coli`, `TotalColiforms`, entericEnterococci, ESBL, VRE), as.numeric),
    # Tide phase as a factor with a fixed level order.
    # NOTE(review): the literal "NA" level looks unused, because the import
    # already turns "NA" cells into real missing values - confirm.
    tide = factor(tide, levels = c("ebb", "flood", "slack", "NA")),
    Waterway = as.factor(Waterway)
  ) %>%
  # Keep only samples that have a waterway recorded
  filter(!is.na(Waterway))
# Shared recipe for Figures 1-5: a boxplot of one bacterial count per tide
# condition, filled by waterway, with the individual samples jittered on top.
#   df      - data frame with `tide`, `Waterway` and the column named in y_var
#   y_var   - name of the count column to plot, as a string
#   title   - plot title
#   y_label - y-axis label
# Returns the ggplot object; at top level it is printed automatically.
plot_by_tide_and_waterway <- function(df, y_var, title, y_label) {
  ggplot(df, aes(x = tide, y = .data[[y_var]], fill = Waterway)) +
    geom_boxplot() +
    geom_jitter(aes(color = Waterway), width = 0.2) +
    labs(title = title, x = "Tide Condition", y = y_label) +
    theme_minimal()
}
# Boxplot for Total Coliforms, grouped by tide and waterway
plot_by_tide_and_waterway(
  data_cleaned, "TotalColiforms",
  title = "Comparison of Total Coliforms by Tide and Waterway",
  y_label = "Total Coliforms (cfu/100ml)"
)
## Warning: Removed 227 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 227 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Plot for VRE levels, grouped by tide and waterway
plot_by_tide_and_waterway(
  data_cleaned, "VRE",
  title = "Comparison of VRE by Tide and Waterway",
  y_label = "VRE Levels"
)
## Warning: Removed 273 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 273 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Plot for ESBL levels, grouped by tide and waterway
plot_by_tide_and_waterway(
  data_cleaned, "ESBL",
  title = "Comparison of ESBL by Tide and Waterway",
  y_label = "ESBL Levels"
)
## Warning: Removed 275 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 275 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Plot for E. coli levels, grouped by tide and waterway
plot_by_tide_and_waterway(
  data_cleaned, "E.coli",
  title = "Comparison of E. coli Levels by Tide and Waterway",
  y_label = "E. coli (cfu/100ml)"
)
## Warning: Removed 188 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 188 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Plot for Enterococci levels, grouped by tide and waterway
plot_by_tide_and_waterway(
  data_cleaned, "entericEnterococci",
  title = "Comparison of Enteric Enterococci Levels by Tide and Waterway",
  y_label = "Enteric Enterococci (cfu/100ml)"
)
## Warning: Removed 186 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 186 rows containing missing values or values outside the scale range
## (`geom_point()`).
#### Figures 1, 2, 3, 4, 5. This multi-figure set illustrates the
comparative concentrations of various bacterial contaminants, including
Total Coliforms, VRE (Vancomycin-Resistant Enterococci), ESBL (Extended
Spectrum Beta-Lactamase), E. coli, and Enteric Enterococci, across four
waterways (Blackwater, Colne, Roman, Stour) under different tide
conditions (ebb, flood, slack, and NA). All data are presented as
cfu/100ml (colony-forming units per 100 milliliters). The tidal phases
represent different stages of water movement that impact bacterial
concentrations, and the waterways are color-coded: Blackwater (red),
Colne (green), Roman (blue), Stour (purple), and NA (black) for
unspecified or unidentified waterways. The figures show distinct
patterns of contamination. Colne had the highest Total Coliforms during
flood tide, with lower concentrations in Blackwater and Stour, while
Roman showed minimal variability. VRE concentrations were highest in
Stour during ebb tide, followed by Colne and Blackwater during flood,
with Roman consistently lower. ESBL levels peaked in Blackwater during
flood tide, with moderate contamination in Colne and lower levels in
Roman and Stour. E. coli concentrations were highest in Stour during
slack tide, with moderate contamination in Colne and consistently lower
levels in Blackwater and Roman. Enteric Enterococci levels were elevated
in Blackwater during flood tide and in Stour during slack tide, while
Colne and Roman remained lower throughout. These results indicate that
Colne and Stour tend to exhibit higher bacterial levels across
categories, particularly during slack and flood tides. The presence of
antibiotic-resistant bacteria (VRE and ESBL) underscores the
environmental and public health concerns associated with these
waterways. This combined legend contextualizes the figures, linking
bacterial contamination to waterway and tide conditions.
# Keep only samples that have a waterway recorded
data_filtered <- data %>% filter(!is.na(Waterway))
# E. coli by tide phase and waterway on a log10 axis, with dashed reference
# lines for the water-quality classes.
# - Excellent/Good are drawn at 250/500 cfu/100ml so the figure agrees with its
#   caption and with `ec_thresholds` defined later in this script (the lines
#   were previously drawn at 200/400).
# - Line thickness uses `linewidth`; `size` for lines is deprecated since
#   ggplot2 3.4.0.
# - NOTE: zero counts become infinite on the log scale and are dropped from
#   the boxplots by ggplot2.
ggplot(data_filtered, aes(x = tide, y = `E.coli`, fill = Waterway)) +
  geom_boxplot(outlier.shape = 21, outlier.fill = "black", outlier.size = 2) +
  scale_y_log10() + # Logarithmic scale for better visualization
  geom_hline(yintercept = 250, linetype = "dashed", color = "green", linewidth = 1) + # Excellent threshold
  geom_hline(yintercept = 500, linetype = "dashed", color = "blue", linewidth = 1) + # Good threshold
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange", linewidth = 1) + # Sufficient threshold
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red", linewidth = 1) + # Poor threshold
  # Text labels sitting on each threshold line, right-aligned at x = 4.8
  annotate("text", x = 4.8, y = 250, label = "Excellent", color = "green", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 500, label = "Good", color = "blue", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 600, label = "Sufficient", color = "orange", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 1000, label = "Poor", color = "red", hjust = 1, size = 5, fontface = "bold") +
  # Labels and theme options
  labs(
    title = "E. coli Levels by Tide Phase and Waterway",
    y = "E. coli (cfu/100ml)",
    x = "Tide Phase",
    fill = "Waterway"
  ) +
  # One fixed colour per waterway (no NA entry: those samples were filtered out)
  scale_fill_manual(values = c(
    "Blackwater" = "salmon",
    "Colne" = "green3",
    "Roman" = "purple",
    "Stour" = "blue"
  )) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Rotate tide labels for better visibility
    plot.title = element_text(size = 16),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 212 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#### Figure 6. This figure shows the E. coli levels (cfu/100ml) across
different waterways (Blackwater, Colne, Roman, and Stour) during various
tidal phases (ebb, flood, slack, and NA); samples with no recorded
waterway are excluded. The waterways are represented by different
colors: Blackwater (salmon), Colne (green), Roman (purple), and Stour
(blue). The graph uses a logarithmic
scale to better visualize the wide range of bacterial concentrations.
Dashed horizontal lines indicate water quality thresholds: Excellent
(green, 250 cfu/100ml), Good (blue, 500 cfu/100ml), Sufficient (orange,
600 cfu/100ml), and Poor (red, 1000+ cfu/100ml). Stour shows notably
high E. coli levels during slack tide, while Colne has moderate levels.
Both Roman and Blackwater exhibit lower contamination across all tidal
phases.
# Keep only samples that have a waterway recorded
data_filtered <- data %>% filter(!is.na(Waterway))
# Total Coliforms by tide phase and waterway on a log10 axis, with dashed
# reference lines for the water-quality classes.
# - Excellent/Good are drawn at 250/500 cfu/100ml to match the figure caption
#   (the lines were previously drawn at 200/400).
# - Line thickness uses `linewidth`; `size` for lines is deprecated since
#   ggplot2 3.4.0.
# - NOTE(review): the same class limits are used for the E. coli figure;
#   confirm they are meant to apply to total coliforms as well.
ggplot(data_filtered, aes(x = tide, y = `TotalColiforms`, fill = Waterway)) +
  geom_boxplot(outlier.shape = 21, outlier.fill = "black", outlier.size = 2) +
  scale_y_log10() + # Logarithmic scale
  geom_hline(yintercept = 250, linetype = "dashed", color = "green", linewidth = 1) + # Excellent threshold
  geom_hline(yintercept = 500, linetype = "dashed", color = "blue", linewidth = 1) + # Good threshold
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange", linewidth = 1) + # Sufficient threshold
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red", linewidth = 1) + # Poor threshold
  # Text labels sitting on each threshold line, right-aligned at x = 4.8
  annotate("text", x = 4.8, y = 250, label = "Excellent", color = "green", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 500, label = "Good", color = "blue", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 600, label = "Sufficient", color = "orange", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 1000, label = "Poor", color = "red", hjust = 1, size = 5, fontface = "bold") +
  labs(
    title = "Total Coliforms Levels by Tide Phase and Waterway",
    y = "Total Coliforms (cfu/100ml)",
    x = "Tide Phase",
    fill = "Waterway"
  ) +
  # One fixed colour per waterway (no NA entry: those samples were filtered out)
  scale_fill_manual(values = c(
    "Blackwater" = "salmon",
    "Colne" = "green3",
    "Roman" = "purple",
    "Stour" = "blue"
  )) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 16),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 233 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#### Figure 7. This figure presents Total Coliform levels (cfu/100ml)
across different waterways (Blackwater, Colne, Roman, and Stour) during
varying tidal phases (ebb, flood, slack, and NA); samples with no
recorded waterway are excluded. The waterways are distinguished by
colors: Blackwater (salmon), Colne (green), Roman (purple), and Stour
(blue). A logarithmic scale is used for
the vertical axis to accommodate a wide range of bacterial
concentrations. The dashed horizontal lines mark water quality
thresholds: Excellent (green, 250 cfu/100ml), Good (blue, 500
cfu/100ml), Sufficient (orange, 600 cfu/100ml), and Poor (red, 1000+
cfu/100ml). Colne consistently exhibits higher Total Coliform levels,
particularly during ebb tide, while other waterways, such as Blackwater
and Roman, display lower concentrations. Total Coliform levels generally
stay within or near the thresholds for Good or Sufficient quality across
most waterways and tide phases, except for a few outliers above the Poor
threshold.
# ESBL by tide phase and waterway on a log10 axis, with dashed reference lines
# for the water-quality classes. Uses the unfiltered `data`, so samples with a
# missing waterway are included.
# - Excellent/Good are drawn at 250/500 cfu/100ml to match the figure caption
#   (the lines were previously drawn at 200/400).
# - Line thickness uses `linewidth`; `size` for lines is deprecated since
#   ggplot2 3.4.0.
# - Missing waterways are coloured through `na.value`: a "NA" entry in
#   `values` only matches the literal string "NA", never a real NA.
ggplot(data, aes(x = tide, y = ESBL, fill = Waterway)) +
  # Boxplot with formatted outliers
  geom_boxplot(outlier.shape = 21, outlier.fill = "black", outlier.size = 2) +
  # Logarithmic y-axis to manage the data spread
  scale_y_log10() +
  # Horizontal reference lines for the water-quality thresholds
  geom_hline(yintercept = 250, linetype = "dashed", color = "green", linewidth = 1) + # Excellent threshold
  geom_hline(yintercept = 500, linetype = "dashed", color = "blue", linewidth = 1) + # Good threshold
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange", linewidth = 1) + # Sufficient threshold
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red", linewidth = 1) + # Poor threshold
  # Text labels sitting on each threshold line, right-aligned at x = 4.8
  annotate("text", x = 4.8, y = 250, label = "Excellent", color = "green", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 500, label = "Good", color = "blue", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 600, label = "Sufficient", color = "orange", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 1000, label = "Poor", color = "red", hjust = 1, size = 5, fontface = "bold") +
  # Title, axis and legend labels
  labs(
    title = "ESBL Levels by Tide Phase and Waterway",
    y = "ESBL (cfu/100ml)",
    x = "Tide Phase",
    fill = "Waterway"
  ) +
  # Custom colour per waterway; grey for samples with no waterway
  scale_fill_manual(
    values = c(
      "Blackwater" = "salmon",
      "Colne" = "green3",
      "Roman" = "purple",
      "Stour" = "blue"
    ),
    na.value = "grey"
  ) +
  # Minimal theme with rotated x-axis labels and explicit text sizes
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 16),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 278 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#### Figure 8. This figure shows ESBL (Extended Spectrum Beta-Lactamase)
bacterial levels (cfu/100ml) across various waterways—Blackwater (red),
Colne (green), Roman (purple), and Stour (blue)—during different tide
phases (ebb, flood, slack, and NA). The chart uses a logarithmic scale
to illustrate a wide range of bacterial concentrations. Dashed
horizontal lines represent water quality thresholds: Excellent (green,
250 cfu/100ml), Good (blue, 500 cfu/100ml), Sufficient (orange, 600
cfu/100ml), and Poor (red, 1000+ cfu/100ml). ESBL levels are highest in
Blackwater during ebb tide, with Colne also showing elevated levels,
particularly during flood tide. Roman and Stour have lower ESBL
concentrations, often falling within the thresholds for Excellent or
Good quality.
# ESBL by tide phase and waterway on a log10 axis with class reference lines.
# NOTE(review): this call draws the same ESBL figure as the preceding chunk;
# confirm whether a different variable was intended here.
# - Excellent/Good are drawn at 250/500 cfu/100ml to match the figure caption
#   (the lines were previously drawn at 200/400).
# - Line thickness uses `linewidth`; `size` for lines is deprecated since
#   ggplot2 3.4.0.
# - Missing waterways are coloured through `na.value`: a "NA" entry in
#   `values` only matches the literal string "NA", never a real NA.
ggplot(data, aes(x = tide, y = ESBL, fill = Waterway)) +
  geom_boxplot(outlier.shape = 21, outlier.fill = "black", outlier.size = 2) +
  scale_y_log10() + # Logarithmic scale
  geom_hline(yintercept = 250, linetype = "dashed", color = "green", linewidth = 1) + # Excellent threshold
  geom_hline(yintercept = 500, linetype = "dashed", color = "blue", linewidth = 1) + # Good threshold
  geom_hline(yintercept = 600, linetype = "dashed", color = "orange", linewidth = 1) + # Sufficient threshold
  geom_hline(yintercept = 1000, linetype = "dashed", color = "red", linewidth = 1) + # Poor threshold
  annotate("text", x = 4.8, y = 250, label = "Excellent", color = "green", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 500, label = "Good", color = "blue", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 600, label = "Sufficient", color = "orange", hjust = 1, size = 5, fontface = "bold") +
  annotate("text", x = 4.8, y = 1000, label = "Poor", color = "red", hjust = 1, size = 5, fontface = "bold") +
  labs(
    title = "ESBL Levels by Tide Phase and Waterway",
    y = "ESBL (cfu/100ml)",
    x = "Tide Phase",
    fill = "Waterway"
  ) +
  scale_fill_manual(
    values = c(
      "Blackwater" = "salmon",
      "Colne" = "green3",
      "Roman" = "purple",
      "Stour" = "blue"
    ),
    na.value = "grey"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 16),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 278 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#### Figure 9. This graph displays the levels of ESBL (Extended Spectrum
Beta-Lactamase) bacteria across different waterways (Blackwater, Colne,
Roman, and Stour) during various tidal phases (ebb, flood, slack, and
NA). The bacterial concentrations are measured in cfu/100ml. The
horizontal dashed lines represent water quality thresholds: Excellent
(green, 250 cfu/100ml), Good (blue, 500 cfu/100ml), Sufficient (orange,
600 cfu/100ml), and Poor (red, above 1000 cfu/100ml). ESBL levels are
highest in Blackwater during the ebb tide, with the flood tide in the
Colne waterway also showing elevated levels that approach the Poor
threshold. Roman and Stour waterways generally exhibit lower ESBL
levels.
# Convert date and time columns to appropriate types
data$`dateCollected (dd/mm/yy)` <- as.Date(data$`dateCollected (dd/mm/yy)`, format = "%d/%m/%y")
data$`timeCollected (hh:mm)` <- as.POSIXct(data$`timeCollected (hh:mm)`, format = "%H:%M")
# Month of collection as a factor whose levels run July -> June.
# The name is looked up in the built-in English `month.name` rather than with
# format(x, "%B"): that output follows the session locale, so on a non-English
# system it would not match the levels and every month would become NA.
data$month <- month.name[as.POSIXlt(data$`dateCollected (dd/mm/yy)`)$mon + 1L]
data$month <- factor(data$month, levels = month.name[c(7:12, 1:6)])
# Convert bacteria columns to numeric
data <- data %>%
  mutate(across(c(`E.coli`, `TotalColiforms`, entericEnterococci, ESBL, VRE), as.numeric))
# Remove columns that are not needed below. any_of() ignores names that are
# already gone, so this chunk can be re-run without an error.
data <- data %>%
  dplyr::select(-any_of(c("TimeSinceHighTide (hh:mm)", "PlusCode", "notes")))
# Overall mean of each bacterial count, ignoring missing measurements.
# Output columns are named "<column>_mean", in the order listed here.
summary_stats <- data %>%
  summarise(
    across(
      c(`E.coli`, entericEnterococci, ESBL, VRE, TotalColiforms, ColiformsNonEC),
      ~ mean(.x, na.rm = TRUE),
      .names = "{.col}_mean"
    )
  )
summary_stats
## # A tibble: 1 × 6
## E.coli_mean entericEnterococci_mean ESBL_mean VRE_mean TotalColiforms_mean
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 993. 618. 1213. 461. 783.
## # ℹ 1 more variable: ColiformsNonEC_mean <dbl>
# Mean concentration of each indicator per waterway, ignoring missing values
average_concentrations <- data %>%
  group_by(Waterway) %>%
  summarise(
    across(
      c(`E.coli`, entericEnterococci, ESBL, VRE),
      ~ mean(.x, na.rm = TRUE),
      .names = "Avg_{.col}"
    )
  ) %>%
  # Shorter name for the enterococci column
  rename(Avg_Enterococci = Avg_entericEnterococci)
# Show the per-waterway table
average_concentrations
## # A tibble: 6 × 5
## Waterway Avg_E.coli Avg_Enterococci Avg_ESBL Avg_VRE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 "" NaN NaN NaN NaN
## 2 "Blackwater" 44.3 634 764 236
## 3 "Colne" 585. 394. 1635. 384.
## 4 "Roman" 1072 496 1088 80
## 5 "Stour" 3218. 1575. 21.3 1052
## 6 <NA> 42 81 NaN NaN
# Upper limits (cfu/100ml) for each water-quality class
ec_thresholds <- c(Excellent = 250, Good = 500, Sufficient = 500, Poor = Inf)
enterococci_thresholds <- c(Excellent = 100, Good = 200, Sufficient = 185, Poor = Inf)
# Classify every sample against the thresholds.
# Samples with no measurement stay NA: without the explicit is.na() branch the
# comparisons evaluate to NA, fall through to the final `TRUE` branch, and
# every unmeasured sample would be counted as "Poor".
# NOTE(review): as the limits are written, "Sufficient" can never be assigned
# (its limit is not above the "Good" limit for either organism) - confirm the
# intended values.
data <- data %>%
  mutate(
    E.coli_quality = case_when(
      is.na(`E.coli`) ~ NA_character_,
      `E.coli` <= ec_thresholds[["Excellent"]] ~ "Excellent",
      `E.coli` <= ec_thresholds[["Good"]] ~ "Good",
      `E.coli` <= ec_thresholds[["Sufficient"]] ~ "Sufficient",
      TRUE ~ "Poor"
    ),
    entericEnterococci_quality = case_when(
      is.na(entericEnterococci) ~ NA_character_,
      entericEnterococci <= enterococci_thresholds[["Excellent"]] ~ "Excellent",
      entericEnterococci <= enterococci_thresholds[["Good"]] ~ "Good",
      entericEnterococci <= enterococci_thresholds[["Sufficient"]] ~ "Sufficient",
      TRUE ~ "Poor"
    )
  )
# Display the updated dataset with water quality categories
data
## # A tibble: 297 × 22
## sampleID siteName siteID `dateCollected (dd/mm/yy)` Waterway
## <chr> <chr> <chr> <date> <chr>
## 1 AA01 CampusLake WP_1 2023-07-05 <NA>
## 2 AA02 Wivenhoe tidal barrier Co_1 2023-07-25 Colne
## 3 AA03 Wivenhoe tidal barrier Co_1 2023-07-25 Colne
## 4 AA04 Wivenhoe tidal barrier Co_1 2023-07-25 Colne
## 5 AA05 Alresford creek Co_2 2023-07-25 Colne
## 6 AA06 Alresford creek Co_2 2023-07-25 Colne
## 7 AA07 Alresford creek Co_2 2023-07-25 Colne
## 8 AA08 buoy 17 Co_4 2023-07-25 Colne
## 9 AA09 buoy 17 Co_4 2023-07-25 Colne
## 10 AA10 buoy 17 Co_4 2023-07-25 Colne
## # ℹ 287 more rows
## # ℹ 17 more variables: `timeCollected (hh:mm)` <dttm>, tide <chr>,
## # collector <chr>, `latitude (xx.xxxxxx)` <dbl>,
## # `longitude (x.xxxxxx)` <dbl>, waterSample <dbl>, sedimentSample <dbl>,
## # daysSinceRain <dbl>, ColiformsNonEC <dbl>, E.coli <dbl>,
## # TotalColiforms <dbl>, entericEnterococci <dbl>, ESBL <dbl>, VRE <dbl>,
## # month <fct>, E.coli_quality <chr>, entericEnterococci_quality <chr>
# Open the classified data in the RStudio viewer; only done in an interactive
# session so that knitting or batch runs do not try to open a viewer.
if (interactive()) {
  View(data)
}
# Number of samples per month and E. coli quality class. `.groups = "drop"`
# returns an ungrouped table and avoids the regrouping message.
quality_counts <- data %>%
  group_by(month, E.coli_quality) %>%
  summarise(count = n(), .groups = "drop")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Drop rows where either the month or the quality class is missing
quality_counts_filtered <- filter(
  quality_counts,
  !is.na(month),
  !is.na(E.coli_quality)
)
# Side-by-side bars: E. coli quality class counts for each month
ggplot(quality_counts_filtered, aes(month, count, fill = E.coli_quality)) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c(
      "Excellent" = "lightblue",
      "Good" = "blue",
      "Sufficient" = "#D2B48C",
      "Poor" = "brown"
    )
  ) +
  labs(
    title = "E. coli Quality Classification Counts by Month",
    x = "Month",
    y = "Count"
  )
# Number of samples per month and Enterococci quality class. `.groups = "drop"`
# returns an ungrouped table and avoids the regrouping message.
quality_counts <- data %>%
  group_by(month, entericEnterococci_quality) %>%
  summarise(count = n(), .groups = "drop")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Drop rows where either the month or the quality class is missing
quality_counts_filtered <- filter(
  quality_counts,
  !is.na(month),
  !is.na(entericEnterococci_quality)
)
# Side-by-side bars: Enterococci quality class counts for each month
ggplot(quality_counts_filtered, aes(month, count, fill = entericEnterococci_quality)) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c(
      "Excellent" = "lightblue",
      "Good" = "blue",
      "Sufficient" = "#D2B48C",
      "Poor" = "brown"
    )
  ) +
  labs(
    title = "Enterococci Quality Classification Counts by Month",
    x = "Month",
    y = "Count"
  )
#### Figures 10 & 11. These two figures present the monthly
distribution of water quality classifications for E. coli and
Enterococci, based on contamination thresholds of Excellent (light
blue), Good (blue), Sufficient (tan), and Poor (brown). In both figures,
August exhibits the
highest number of Excellent quality counts, while Poor quality levels
are consistently observed across July, September, and October. Good
quality counts remain relatively low for both bacteria, with slight
increases observed in September and October. The figures highlight that
August stands out for its superior water quality in terms of E. coli and
Enterococci, while Poor classifications are spread more evenly across
the other months, especially during mid-summer and early fall.
# Five-number summaries (plus mean and NA count) for the two main indicators
cat("Summary of E.coli concentrations across sites:")
## Summary of E.coli concentrations across sites:
summary(data[["E.coli"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 8.0 86.0 993.3 568.0 22160.0 188
cat("Summary of Enterococci concentrations across sites:")
## Summary of Enterococci concentrations across sites:
summary(data[["entericEnterococci"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 4.0 82.0 618.2 395.0 9760.0 186
# Per-waterway means of the four indicators (missing values ignored); this is
# the same calculation as the earlier per-waterway table.
average_concentrations <- data %>%
  group_by(Waterway) %>%
  summarise(
    across(
      c(`E.coli`, entericEnterococci, ESBL, VRE),
      ~ mean(.x, na.rm = TRUE),
      .names = "Avg_{.col}"
    )
  ) %>%
  rename(Avg_Enterococci = Avg_entericEnterococci)
average_concentrations
## # A tibble: 6 × 5
## Waterway Avg_E.coli Avg_Enterococci Avg_ESBL Avg_VRE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 "" NaN NaN NaN NaN
## 2 "Blackwater" 44.3 634 764 236
## 3 "Colne" 585. 394. 1635. 384.
## 4 "Roman" 1072 496 1088 80
## 5 "Stour" 3218. 1575. 21.3 1052
## 6 <NA> 42 81 NaN NaN
# Summary table of average concentrations by waterway and month (missing
# values ignored). `.groups = "drop"` returns an ungrouped table, so later
# operations on it are not silently performed per waterway.
average_concentrations_by_waterway_month <- data %>%
  group_by(Waterway, month) %>%
  summarise(
    Avg_E.coli = mean(`E.coli`, na.rm = TRUE),
    Avg_Enterococci = mean(entericEnterococci, na.rm = TRUE),
    Avg_ESBL = mean(ESBL, na.rm = TRUE),
    Avg_VRE = mean(VRE, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'Waterway'. You can override using the
## `.groups` argument.
average_concentrations_by_waterway_month
## # A tibble: 12 × 6
## # Groups: Waterway [6]
## Waterway month Avg_E.coli Avg_Enterococci Avg_ESBL Avg_VRE
## <chr> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 "" <NA> NaN NaN NaN NaN
## 2 "Blackwater" August 31.3 16 NaN NaN
## 3 "Blackwater" September 96 2488 764 236
## 4 "Colne" July 1094. 980 NaN NaN
## 5 "Colne" August 298. 101 NaN NaN
## 6 "Colne" September 718. 332. 1635. 384.
## 7 "Colne" October 65.6 24 NaN NaN
## 8 "Roman" October 1072 496 1088 80
## 9 "Stour" August 4204. 1971. NaN NaN
## 10 "Stour" September 771. 876 NaN NaN
## 11 "Stour" October 3564 736 21.3 1052
## 12 <NA> July 42 81 NaN NaN
# Class limits for the inland bathing water standards, one named vector per
# indicator organism
inland_thresholds <- list(
  E.coli = c(Excellent = 500, Good = 1000, Sufficient = 900, Poor = Inf),
  enterococci = c(Excellent = 200, Good = 400, Sufficient = 330, Poor = Inf)
)
# Turn any "NC" entries in text columns into missing values
data <- mutate(
  data,
  across(where(is.character), function(col) na_if(col, "NC"))
)
# Keep only the samples that have a waterway recorded
data_filtered <- filter(data, !is.na(Waterway))
#' Boxplot of a count by month and waterway with threshold lines (log10 axis)
#'
#' Draws one box per month and waterway for the column named in `y_var`, adds
#' dashed lines at the "Excellent", "Good" and "Sufficient" thresholds, and a
#' dotted "Poor" line at the largest observed value.
#'
#' @param data Data frame with `month`, `Waterway` and the `y_var` column.
#' @param y_var Name of the numeric column to plot, as a single string.
#' @param thresholds Named numeric vector with at least the elements
#'   "Excellent", "Good" and "Sufficient".
#' @param title Plot title.
#' @param y_label Label for the y-axis.
#' @return A ggplot object.
plot_with_thresholds_log <- function(data, y_var, thresholds, title, y_label) {
  # Fail early with a clear error instead of silently drawing no line
  stopifnot(
    is.character(y_var), length(y_var) == 1L, y_var %in% names(data),
    all(c("Excellent", "Good", "Sufficient") %in% names(thresholds))
  )
  # Largest observed value; used for both the "Poor" line and its label
  observed_max <- max(data[[y_var]], na.rm = TRUE)

  ggplot(data, aes(x = month, y = .data[[y_var]], fill = Waterway)) +
    geom_boxplot() +
    geom_hline(yintercept = thresholds[["Excellent"]], linetype = "dashed", color = "green", linewidth = 1) +
    geom_hline(yintercept = thresholds[["Good"]], linetype = "dashed", color = "blue", linewidth = 1) +
    geom_hline(yintercept = thresholds[["Sufficient"]], linetype = "dashed", color = "orange", linewidth = 1) +
    geom_hline(yintercept = observed_max, linetype = "dotted", color = "red", linewidth = 1) + # Poor line
    scale_y_log10() + # Apply log scale to y-axis
    labs(title = title, y = y_label, x = "Month") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    # Labels pinned to the right-hand edge of the panel (x = Inf)
    annotate("text", x = Inf, y = thresholds[["Excellent"]], label = "Excellent", color = "green", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = thresholds[["Good"]], label = "Good", color = "blue", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = thresholds[["Sufficient"]], label = "Sufficient", color = "orange", hjust = 1.1, vjust = -0.5) +
    annotate("text", x = Inf, y = observed_max, label = "Poor", color = "red", hjust = 1.1, vjust = 2)
}
# E. coli by month for the waterway-filtered samples, against the inland limits
plot_with_thresholds_log(
  data = data_filtered,
  y_var = "E.coli",
  thresholds = inland_thresholds[["E.coli"]],
  title = "E. coli Levels by Waterway and Month",
  y_label = "E. coli (cfu/100ml)"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 212 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Enterococci by month for the waterway-filtered samples, against the inland
# limits
plot_with_thresholds_log(
  data = data_filtered,
  y_var = "entericEnterococci",
  thresholds = inland_thresholds[["enterococci"]],
  title = "Enterococci Levels by Waterway and Month",
  y_label = "Enterococci (cfu/100ml)"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 211 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Resistant bacteria (ESBL and VRE) by month, filtered data, log10 y-axis.
# Two boxplot layers are drawn over the same x positions: ESBL first, then a
# half-transparent VRE layer on top. Each layer sets its own constant fill,
# which replaces the plot-level `fill = Waterway` mapping for that layer.
data_filtered %>%
  ggplot(mapping = aes(x = month, fill = Waterway)) +
  geom_boxplot(mapping = aes(y = ESBL, fill = "ESBL"), colour = "darkred") +
  geom_boxplot(
    mapping = aes(y = VRE, fill = "VRE"),
    colour = "darkblue",
    alpha = 0.5
  ) +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Resistant Bacteria (ESBL & VRE) Levels by Waterway and Month",
    y = "Concentration (cfu/100ml)",
    fill = "Bacteria Type"
  )
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 277 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 280 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#### Figure 12, 13, 14. Figures 12, 13, and 14 compare bacterial
contamination levels across different waterways (Blackwater, Colne,
Roman, Stour) by month, highlighting trends for E. coli, Enterococci,
and antibiotic-resistant bacteria (ESBL and VRE). In Figure 12, E. coli
levels show varying trends, with peaks observed in different waterways
across July to October, and thresholds for Excellent, Good, Sufficient,
and Poor water quality are marked on the graph. Similarly, Figure 13
presents Enterococci levels by month and waterway and shows marked
variability between months and waterways, with Blackwater showing
elevated levels in September and Colne in July. Threshold lines for
water quality classification are also indicated. Figure 14 focuses on
antibiotic-resistant bacteria, comparing ESBL and VRE concentrations
across months. While both resistant bacteria types are present, ESBL
generally shows the higher concentration, particularly in September.
# Inland bathing-water classification limits, one named numeric vector per
# indicator organism (names: Excellent, Good, Sufficient, Poor).
# NOTE(review): "Sufficient" is numerically lower than "Good" for both
# organisms -- presumably the standard assesses it on a different percentile;
# confirm against the source standard before reordering these values.
inland_thresholds <- lapply(
  list(
    E.coli = c(500, 1000, 900, Inf),
    enterococci = c(200, 400, 330, Inf)
  ),
  setNames,
  nm = c("Excellent", "Good", "Sufficient", "Poor")
)
# Replace the "NC" marker with NA in character columns only.
data <- data %>%
  mutate(across(where(is.character), ~ na_if(.x, "NC")))
# Keep only rows with a usable Waterway label. Blank strings are excluded as
# well as NA: the summary tables above contain a "" Waterway group, which
# !is.na() alone would pass through to the plots as an unnamed waterway.
data_filtered <- data %>%
  filter(!is.na(Waterway), trimws(Waterway) != "")
# Boxplot of one bacterial measurement on a log10 y-axis, filled by Waterway,
# with the classification limits drawn as labelled horizontal reference lines.
# This redefinition replaces the earlier by-month version; its defaults put
# the site ID on the x-axis.
#
# Arguments:
#   data        data frame containing `x_var`, `y_var` and a `Waterway` column.
#   y_var       name (string) of the numeric column to plot.
#   thresholds  named numeric vector with entries "Excellent", "Good" and
#               "Sufficient".
#   title       plot title.
#   y_label     y-axis label.
#   x_var       name (string) of the column for the x-axis (default "siteID").
#   x_label     x-axis label (default "Site ID").
# Returns the ggplot object.
#
# The dotted red "Poor" line is drawn at the largest observed value of `y_var`;
# the "Poor" entry of `thresholds` is not used.
# NOTE: zero counts become -Inf under the log10 scale, so ggplot2 drops them
# with a warning.
plot_with_thresholds_log <- function(data, y_var, thresholds, title, y_label,
                                     x_var = "siteID", x_label = "Site ID") {
  stopifnot(all(c(x_var, y_var) %in% names(data)))

  # Largest observed value; computed once and shared by the "Poor" line and
  # its label.
  observed_max <- max(data[[y_var]], na.rm = TRUE)

  # One horizontal reference line.
  reference_line <- function(y, colour, linetype = "dashed") {
    geom_hline(yintercept = y, linetype = linetype, color = colour, linewidth = 1)
  }
  # Text label for a reference line, pinned to the right-hand edge (x = Inf).
  reference_label <- function(y, label, colour, vjust = -0.5) {
    annotate("text", x = Inf, y = y, label = label, color = colour,
             hjust = 1.1, vjust = vjust)
  }

  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]], fill = Waterway)) +
    geom_boxplot() +
    reference_line(thresholds[["Excellent"]], "green") +
    reference_line(thresholds[["Good"]], "blue") +
    reference_line(thresholds[["Sufficient"]], "orange") +
    reference_line(observed_max, "red", linetype = "dotted") +
    scale_y_log10() +
    labs(title = title, y = y_label, x = x_label) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    reference_label(thresholds[["Excellent"]], "Excellent", "green") +
    reference_label(thresholds[["Good"]], "Good", "blue") +
    reference_label(thresholds[["Sufficient"]], "Sufficient", "orange") +
    reference_label(observed_max, "Poor", "red", vjust = 2)
}
# E. coli per site for the waterway-filtered samples, against the inland limits
plot_with_thresholds_log(
  data = data_filtered,
  y_var = "E.coli",
  thresholds = inland_thresholds[["E.coli"]],
  title = "E. coli Levels by Site ID",
  y_label = "E. coli (cfu/100ml)"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 212 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Enterococci per site for the waterway-filtered samples, against the inland
# limits
plot_with_thresholds_log(
  data = data_filtered,
  y_var = "entericEnterococci",
  thresholds = inland_thresholds[["enterococci"]],
  title = "Enterococci Levels by Site ID",
  y_label = "Enterococci (cfu/100ml)"
)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 211 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Resistant bacteria (ESBL and VRE) per site ID, filtered data, log10 y-axis.
# Two boxplot layers are drawn over the same x positions: ESBL first, then a
# half-transparent VRE layer on top. Each layer sets its own constant fill,
# which replaces the plot-level `fill = Waterway` mapping for that layer.
data_filtered %>%
  ggplot(mapping = aes(x = siteID, fill = Waterway)) +
  geom_boxplot(mapping = aes(y = ESBL, fill = "ESBL"), colour = "darkred") +
  geom_boxplot(
    mapping = aes(y = VRE, fill = "VRE"),
    colour = "darkblue",
    alpha = 0.5
  ) +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Resistant Bacteria (ESBL & VRE) Levels by Site ID",
    y = "Concentration (cfu/100ml)",
    fill = "Bacteria Type"
  )
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 277 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 280 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#### Figure 15, 16, 17. In Figures 15, 16, and 17, we analyze the
concentrations of E. coli, Enterococci, and resistant bacteria (ESBL and
VRE) across various site IDs for the Blackwater, Colne, Roman, and Stour
waterways. Each figure visualizes bacterial concentration data in
cfu/100ml using box plots for individual site IDs, overlaid with water
quality thresholds: Excellent, Good, Sufficient, and Poor. In the E.
coli figure, high concentrations are observed in sites like Stour (S1,
S3) and Colne (C9), surpassing the Poor quality threshold, while most
other sites fall within the Sufficient or Good thresholds. Enterococci
concentrations follow a similar pattern, with several sites exceeding
Poor thresholds, particularly in Stour and Colne, indicating substantial
contamination at certain points. Resistant bacteria (ESBL and VRE) show
a diverse distribution, with ESBL concentrations significantly higher at
some sites (e.g., Hy1 and Hy2) while VRE levels are more sporadic,
indicating variability in antibiotic-resistant bacterial presence across
sites.
# Hand-entered sample results: one row per sample, keyed by collection date
# (dd-mm-yyyy). NOTE: this overwrites the `data` object used earlier.
data <- local({
  # Collection dates in chronological order. They double as the factor levels
  # so the x-axis follows this order instead of sorting the strings.
  sampling_dates <- c(
    "25-07-2023", "10-08-2023", "11-10-2023",
    "08-12-2023", "01-03-2024", "27-04-2024"
  )
  # Number of samples entered for each date, in the same order.
  samples_per_date <- c(3, 3, 2, 1, 1, 1)

  data.frame(
    Date = factor(
      rep(sampling_dates, times = samples_per_date),
      levels = sampling_dates
    ),
    Enterococci = c(432, 688, 540, 40, 160, 40, 31, 22, NA, 290, 20),
    E_coli = c(820, 1016, 728, 40, 160, 40, 52, 304, 3280, 480, 1410)
  )
})
# Reshape to long format for ggplot: one row per (Date, Bacteria) pair, with
# the measured value in `Count`.
library(reshape2)
data_long <- melt(
  data,
  id.vars = "Date",
  measure.vars = c("Enterococci", "E_coli"),
  variable.name = "Bacteria",
  value.name = "Count"
)
# Boxplots of Count per collection date, one facet per bacterium (independent
# y-axes), with four dashed horizontal reference lines.
# NOTE(review): the same illustrative limits are drawn in both facets and are
# not taken from `inland_thresholds` -- confirm which limits are intended.
ggplot(data_long, aes(x = Date, y = Count)) +
  geom_boxplot(aes(fill = Bacteria)) +
  Map(
    function(limit, line_colour) {
      geom_hline(yintercept = limit, linetype = "dashed", color = line_colour)
    },
    # Example limits, in order: Excellent, Good, Sufficient, Poor
    c(200, 400, 600, 1000),
    c("green", "blue", "orange", "red")
  ) +
  labs(
    title = "Bacterial Levels by Date (Site Co_1)",
    y = "Bacteria Count (cfu/100ml)",
    x = "Date Collected"
  ) +
  facet_wrap(~Bacteria, scales = "free_y") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
#### Figure 18. This figure illustrates the bacterial contamination
levels of Enterococci and E. coli at Site Co_1 over various sampling
dates, with the x-axis displaying the collection dates (from July 2023
to April 2024) and the y-axis representing bacterial counts in
cfu/100ml. The left panel shows Enterococci levels in red boxplots,
while the right panel presents E. coli levels in teal boxplots. Both
panels are overlaid with dashed lines indicating water quality
thresholds: green for Excellent, blue for Good, orange for Sufficient,
and red for Poor. The highest Enterococci concentrations were observed
on 25-07-2023, exceeding the Sufficient threshold, followed by a
significant drop on 10-08-2023, with consistently low levels afterward.
For E. coli, the highest concentration occurred on 10-08-2023, exceeding
the Good threshold, followed by a decrease below the Excellent threshold
by 08-12-2023, and a subsequent spike on 27-04-2024 approaching the Poor
threshold. The graph effectively visualizes changes in bacterial
contamination and fluctuations in water quality at Site Co_1 over
time.
# Hand-entered sample results: one row per sample with its collection date.
# NOTE: this overwrites the previous `data` object.
# NOTE(review): for dates shared with the earlier by-date table, several values
# differ (e.g. the 2023-08-10 and 2023-12-08 rows) -- confirm which entries
# are correct.
data <- local({
  # Distinct runs of collection dates in entry order, and how many consecutive
  # samples were entered for each run.
  date_runs <- c(
    "2023-10-11", "2023-07-25", "2023-08-10", "2023-10-11", "2023-12-08",
    "2024-03-01", "2024-04-27", "2024-06-06", "2024-07-05"
  )
  run_lengths <- c(2, 3, 3, 2, 1, 1, 1, 1, 1)

  data.frame(
    Date = as.Date(rep(date_runs, times = run_lengths), format = "%Y-%m-%d"),
    Enterococci = c(19, 9, 432, 688, 540, 0, 0, 0, 31, 22, 3280, 290, 20, 560, 200),
    E_coli = c(84, 34, 820, 1016, 728, 0, 0, 0, 52, 304, 0, 480, 2820, 720, 720)
  )
})
# Derive the English month name from the month number. format(Date, "%B")
# depends on the session locale, so in a non-English locale its output would
# not match `month.name` below and every Month would become NA.
data$Month <- month.name[as.integer(format(data$Date, "%m"))]
# Reshape to long format for ggplot. Only the two count columns are measured:
# without `measure.vars`, melt() also stacks the Date column as a third
# "Bacteria" level, which adds a spurious facet of numeric dates and triggers
# the "attributes are not identical across measure variables" warning.
data_long <- melt(
  data,
  id.vars = "Month",
  measure.vars = c("Enterococci", "E_coli"),
  variable.name = "Bacteria",
  value.name = "Count"
)
# Order months by calendar position (not alphabetically) for the x-axis
data_long$Month <- factor(data_long$Month, levels = month.name)
# Boxplots of Count per calendar month, one facet per level of `Bacteria`
# (independent y-axes), with four dashed horizontal reference lines.
# NOTE(review): the same illustrative limits are drawn in every facet and are
# not taken from `inland_thresholds` -- confirm which limits are intended.
ggplot(data_long, aes(x = Month, y = Count)) +
  geom_boxplot(aes(fill = Bacteria)) +
  Map(
    function(limit, line_colour) {
      geom_hline(yintercept = limit, linetype = "dashed", color = line_colour)
    },
    # Example limits, in order: Excellent, Good, Sufficient, Poor
    c(200, 400, 600, 1000),
    c("green", "blue", "orange", "red")
  ) +
  labs(
    title = "Bacterial Levels by Month (Site Specific)",
    y = "Bacteria Count (cfu/100ml)",
    x = "Month"
  ) +
  facet_wrap(~Bacteria, scales = "free_y") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#### Figure 19. This figure presents bacterial levels of Enterococci and
E. coli over different months at a specific site. The x-axis represents
months from March to December, and the y-axis represents the bacterial
count in cfu/100ml. The graph consists of three panels: the first for
Date (shown in black), the second for Enterococci (green boxplots), and
the third for E. coli (blue boxplots). The dashed lines correspond to
water quality thresholds: green for Excellent, blue for Good, orange for
Sufficient, and red for Poor. The Date panel does not show bacterial
counts: its near-constant values (around 20,000) are the sampling dates
converted to numbers when the data were reshaped. The Enterococci panel shows
concentrations peaking in July and remaining above the Poor threshold by
December, while concentrations in other months remain below the
Excellent threshold. Similarly, E. coli levels peak in July and
gradually decrease by December, with some values exceeding the Poor
threshold in August.