# Load necessary libraries
pacman::p_load(pacman, ggplot2, dplyr, lubridate, sf, gridExtra, rnaturalearth, rnaturalearthdata)

# Read the raw spill records, then parse both overflow timestamp columns
# (text laid out as "YYYY-MM-DD HH:MM:SS") into POSIXct values in UTC.
# Values that do not match the format become NA.
data <- read.csv("full_scottish_sewage_spills.csv") %>%
  mutate(
    across(
      c(Overflow.Event.Start.Time, Overflow.Event.End.Time),
      function(stamp) {
        as.POSIXct(stamp, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
      }
    )
  )

# Base map layers from rnaturalearth.
# `world` is the full medium-scale country layer; it is kept available for
# any later code that wants the wider context.
world <- ne_countries(scale = "medium", returnclass = "sf")

# In the "countries" layer the United Kingdom is one undivided feature, so
# filtering it on `admin` draws the whole UK rather than Scotland. The
# "map_units" layer splits the UK into its constituent units, which lets us
# select Scotland on its own so the map matches its title.
scotland_map <- ne_countries(
  scale = "medium",
  type = "map_units",
  geounit = "scotland",
  returnclass = "sf"
)

# Fail fast instead of silently drawing an empty base map.
if (nrow(scotland_map) == 0) {
  stop(
    "No 'Scotland' feature found in the Natural Earth map_units layer.",
    call. = FALSE
  )
}

# Keep only rows whose coordinates are usable. is.finite() is FALSE for NA,
# NaN and +/-Inf, so a single test per column covers every invalid case.
data <- filter(data, is.finite(Longitude) & is.finite(Latitude))

# Build the point layer for mapping: WGS84 (EPSG:4326) points taken from the
# Longitude/Latitude columns, which are also retained as ordinary columns.
data_sf <- data %>%
  st_as_sf(
    coords = c("Longitude", "Latitude"),
    crs = 4326,
    remove = FALSE
  )

# Fixed colour for each Source Type, shared by every plot below so that the
# legends stay consistent. Names must match the values in `Source.Type`.
source_colors <- c(
  "Combined Sewage Overflow" = "#00ff00", # green
  "WWTW" = "#990000", # dark red
  "Storm Overflow" = "#000000", # black
  "Unknown Source Type" = "#ff33cc" # pink
)

# Derive the month of each event start as an English three-letter label.
# The labels are built as plain character strings first: assigning "Unknown"
# directly into the factor returned by month(label = TRUE) is an invalid
# factor level and silently produces NA instead. Indexing month.abb by the
# month number also keeps the labels independent of the session locale.
data$Month <- month.abb[month(data$Overflow.Event.Start.Time)]

# Events without a parseable start time get an explicit "Unknown" bucket.
data$Month[is.na(data$Month)] <- "Unknown"

# Calendar order with "Unknown" last. The factor stays ordered, matching the
# class that month(label = TRUE) produced previously.
data$Month <- factor(
  data$Month,
  levels = c(month.abb, "Unknown"),
  ordered = TRUE
)

# Season of each event start, looked up by month number (position 1 = Jan,
# position 12 = Dec). Events with no start time index with NA and therefore
# get an NA season.
data$Season <- c(
  "Winter", "Winter",
  "Spring", "Spring", "Spring",
  "Summer", "Summer", "Summer",
  "Autumn", "Autumn", "Autumn",
  "Winter"
)[month(data$Overflow.Event.Start.Time)]

# Subset used by the seasonal plot: only events with a known season.
data_filtered <- filter(data, !is.na(Season))


# Plot 1: box plots of discharged volume for each month, with one box per
# source type. Outline and fill both use the shared source-type palette.
p1 <- data %>%
  ggplot(
    aes(
      x = Month,
      y = Volume.Discharged,
      color = Source.Type,
      fill = Source.Type
    )
  ) +
  geom_boxplot() +
  scale_color_manual(values = source_colors) +
  scale_fill_manual(values = source_colors) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Monthly Volume Discharged by Source Type",
    x = "Month",
    y = "Volume Discharged (litres)"
  ) +
  theme(legend.position = "bottom")

# Plot 2: number of overflow events per area as horizontal stacked bars,
# split by source type. Areas are sorted by their per-group event counts.
p2 <- ggplot(
  count(data, Area, Source.Type),
  aes(x = reorder(Area, n), y = n, fill = Source.Type)
) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = source_colors) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Frequency of Overflow Events by Area and Source Type",
    x = "Area",
    y = "Number of Events"
  ) +
  theme(legend.position = "bottom")

# Plot 3: scatter of discharged volume against event duration, coloured by
# source type. Points are semi-transparent so overlapping events stay visible.
p3 <- data %>%
  ggplot(
    aes(x = Duration.Hours, y = Volume.Discharged, color = Source.Type)
  ) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = source_colors) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Volume Discharged by Duration of Overflow Event by Source Type",
    x = "Duration (Hours)",
    y = "Volume Discharged (litres)"
  ) +
  theme(legend.position = "bottom")

# Plot 4: number of overflow events per season, with side-by-side bars for
# each source type. Uses `data_filtered`, i.e. only events with a known season.
p4 <- ggplot(data_filtered, aes(x = Season, fill = Source.Type)) +
  geom_bar(position = "dodge") +
  # Season is a character column, so the axis would otherwise be sorted
  # alphabetically (Autumn, Spring, Summer, Winter). Fix the seasonal order.
  scale_x_discrete(limits = c("Winter", "Spring", "Summer", "Autumn")) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = source_colors) +
  labs(
    title = "Overflow Events by Season and Source Type",
    x = "Season",
    y = "Number of Events"
  ) +
  theme(legend.position = "bottom")

# Draw plots 1-4 together on a single page as a 2 x 2 grid.
# Note: when rendered, geom_point() in plot 3 warns that rows with missing
# values (or values outside the scale range) were removed; the recorded run
# reported 184 such rows.
grid.arrange(grobs = list(p1, p2, p3, p4), nrow = 2, ncol = 2)

# Plot 5: overflow event locations drawn over the `scotland_map` outline.
# Each point is coloured by source type and sized by volume discharged.
ggplot() +
  # Base layer: grey land with a black border.
  geom_sf(data = scotland_map, fill = "#cccccc", color = "#000000") +
  # Event layer, drawn on top of the outline.
  geom_sf(
    data = data_sf,
    mapping = aes(color = Source.Type, size = Volume.Discharged),
    alpha = 0.7
  ) +
  scale_color_manual(values = source_colors) +
  labs(
    title = "Map of Sewage Overflow Events in Scotland by Source Type",
    x = "Longitude",
    y = "Latitude",
    color = "Source Type",
    size = "Volume Discharged (litres)"
  ) +
  theme(legend.position = "bottom")