Author

Jen Richmond

Published

April 15, 2025

data cleaning

There seem to be three pockets of e-coli data in this dataset. To allow for comparison of max historical values across these periods, I am coding pre-2010 data as “really old”, 2010–2019 data as “old”, and data from 2020 onward as “recent”.

Code
# load packages
library(tidyverse)
library(here)
library(gt)
library(ggeasy)

# read data
# Parse the sampling date, derive the calendar month, and assign each sample
# to one of three time periods so maxima can be compared across periods.
ecoli <- read_csv(here("data", "ecoli.csv")) %>%
  mutate(
    date = mdy(date),
    month = month(date, label = TRUE),
    # Conditions are checked in order, so "old" covers 2010-2019. The last
    # branch uses >= 2020 so that samples taken during 2020 are classed as
    # "recent" instead of matching no condition and becoming NA.
    year_category = case_when(
      year(date) < 2010 ~ "really_old",
      year(date) < 2020 ~ "old",
      year(date) >= 2020 ~ "recent"
    ),
    # Order the levels chronologically so tables and facets follow time order
    year_category = fct_relevel(
      year_category,
      c("really_old", "old", "recent")
    )
  )

glimpse(ecoli)
Rows: 223
Columns: 4
$ date          <date> 2001-01-10, 2001-02-09, 2001-03-02, 2001-03-15, 2001-04…
$ concentration <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ month         <ord> Jan, Feb, Mar, Mar, Apr, May, Jun, Jul, Aug, Sep, Nov, D…
$ year_category <fct> really_old, really_old, really_old, really_old, really_o…

descriptives

Here I am interested in the maximum e-coli values in each time period. The maximum value in the dataset is 130000! (date = 11 August 2015)

Code
# Largest and smallest recorded concentration in each time period,
# ignoring missing readings, rendered as a gt table
ecoli %>%
  group_by(year_category) %>%
  summarise(
    max = max(concentration, na.rm = TRUE),
    min = min(concentration, na.rm = TRUE)
  ) %>%
  gt::gt()
year_category max min
really_old 14000 10
old 130000 2
recent 1300 1
Code
# Show the sample(s) with the highest recorded concentration. The maximum is
# computed from the data instead of being hard-coded, so this stays correct
# if the dataset is updated; filter() drops rows with missing concentration.
ecoli %>%
  filter(concentration == max(concentration, na.rm = TRUE))
# A tibble: 1 × 4
  date       concentration month year_category
  <date>             <dbl> <ord> <fct>        
1 2015-08-11        130000 Aug   old          

plot

Plotting by time period to allow for historical comparison.

Code
# Strongly prefer fixed notation over scientific notation when printing numbers
options(scipen = 999)

# Scatter of concentration over time, one panel per time period
ggplot(ecoli, aes(date, concentration)) +
  geom_point() +
  facet_wrap(~year_category) +
  labs(x = "Date", y = "E-coli level") +
  theme_minimal() +
  # Text sizes are applied after theme_minimal() so they are not overwritten
  easy_text_size(c("axis.text.x", "axis.text.y"), 12) +
  easy_text_size(c("axis.title.x", "axis.title.y"), 14) +
  easy_text_size(c("plot.title", "plot.subtitle"), 16)

Plotting each time period separately to allow for more detailed analysis.

Code
# Detail view: samples in the "really_old" period only
ggplot(
  filter(ecoli, year_category == "really_old"),
  aes(date, concentration)
) +
  geom_point() +
  labs(
    title = "E-coli cfu/100ml",
    subtitle = "2000-2010",
    x = "Date",
    y = "E-coli level",
    caption = "Ecoli max = 14000"
  ) +
  theme_minimal() +
  # Text sizes are applied after theme_minimal() so they are not overwritten
  easy_text_size(c("axis.text.x", "axis.text.y"), 12) +
  easy_text_size(c("axis.title.x", "axis.title.y"), 14) +
  easy_text_size(c("plot.title", "plot.subtitle"), 16)

Code
# Detail view: samples in the "old" period only
ggplot(
  filter(ecoli, year_category == "old"),
  aes(date, concentration)
) +
  geom_point() +
  labs(
    title = "E-coli cfu/100ml",
    subtitle = "2010-2020",
    x = "Date",
    y = "E-coli level",
    caption = "Ecoli max = 130000"
  ) +
  theme_minimal() +
  # Text sizes are applied after theme_minimal() so they are not overwritten
  easy_text_size(c("axis.text.x", "axis.text.y"), 12) +
  easy_text_size(c("axis.title.x", "axis.title.y"), 14) +
  easy_text_size(c("plot.title", "plot.subtitle"), 16)

Code
# Detail view: samples in the "recent" period only
ggplot(
  filter(ecoli, year_category == "recent"),
  aes(date, concentration)
) +
  geom_point() +
  labs(
    title = "E-coli cfu/100ml",
    subtitle = "2020-current",
    x = "Date",
    y = "E-coli level",
    caption = "Ecoli max = 1300"
  ) +
  theme_minimal() +
  # Text sizes are applied after theme_minimal() so they are not overwritten
  easy_text_size(c("axis.text.x", "axis.text.y"), 12) +
  easy_text_size(c("axis.title.x", "axis.title.y"), 14) +
  easy_text_size(c("plot.title", "plot.subtitle"), 16)