Analysis of maximum annual flood events in the Monongahela River watershed.

Load Required Libraries

library(tidyverse)
library(ncdf4)
library(CFtime)
library(ggpmisc)  # to display trendline equation
library(scales)   # plot formatting
library(ggridges) # ridge plots

Load and Prepare Data

# Load stream gage information
stream_gages <- read_csv("huc4_0502_stream_gages_20yr_plus.csv") |>
  mutate(
    site_no    = as.factor(site_no),
    station_nm = as.factor(station_nm),
    count_nu   = as.integer(count_nu)
  ) |>
  select(site_no, station_nm, dec_lat_va, dec_long_va, drain_area_va)

head(stream_gages)

## # A tibble: 6 × 5
##   site_no  station_nm                       dec_lat_va dec_long_va drain_area_va
##   <fct>    <fct>                                 <dbl>       <dbl>         <dbl>
## 1 03050000 TYGART VALLEY RIVER NEAR DAILEY…       38.8       -79.9         188  
## 2 03050500 TYGART VALLEY RIVER NEAR ELKINS…       38.9       -79.9         273  
## 3 03051000 TYGART VALLEY RIVER AT BELINGTO…       39.0       -79.9         415  
## 4 03051500 MIDDLE FORK RIVER AT MIDVALE, WV       38.9       -80.1         123  
## 5 03052000 MIDDLE FORK RIVER AT AUDRA, WV         39.0       -80.1         149  
## 6 03052500 SAND RUN NEAR BUCKHANNON, WV           39.0       -80.2          14.3

# Load maximum annual floods
max_floods <- read_csv("max_annual_floods.csv") |>
  mutate(
    site_no             = as.factor(site_no),
    station_name        = as.factor(station_name),
    year                = as.integer(year),
    max_flood_discharge = as.integer(max_flood_discharge),
    max_flood_month     = as.integer(max_flood_month),
    max_flood_day       = as.integer(max_flood_day),
    max_flood_date      = as.Date(max_flood_date),
    season              = factor(season, levels = c("Winter", "Spring", "Summer", "Autumn"))
  )

head(max_floods)

## # A tibble: 6 × 8
##   site_no  station_name  year max_flood_date max_flood_discharge max_flood_month
##   <fct>    <fct>        <int> <date>                       <int>           <int>
## 1 03050000 TYGART VALL…  1948 1948-02-14                    7640               2
## 2 03050000 TYGART VALL…  1950 1950-06-25                    6240               6
## 3 03050000 TYGART VALL…  1952 1952-01-28                    5100               1
## 4 03050000 TYGART VALL…  1953 1953-02-22                    4320               2
## 5 03050000 TYGART VALL…  1954 1954-10-16                   10400              10
## 6 03050000 TYGART VALL…  1956 1956-05-28                    7250               5
## # ℹ 2 more variables: max_flood_day <int>, season <fct>

# Join datasets to get coordinates for each flood event
floods_with_coords <- max_floods |>
  left_join(stream_gages, by = "site_no") |>
  filter(!is.na(dec_lat_va), !is.na(dec_long_va)) |>
  arrange(year, site_no)

cat("Total flood events to process:", nrow(floods_with_coords), "\n")

## Total flood events to process: 2416

cat("Year range:", min(floods_with_coords$year), "to", max(floods_with_coords$year), "\n")

## Year range: 1948 to 2024

head(floods_with_coords)

## # A tibble: 6 × 12
##   site_no  station_name  year max_flood_date max_flood_discharge max_flood_month
##   <fct>    <fct>        <int> <date>                       <int>           <int>
## 1 03050000 TYGART VALL…  1948 1948-02-14                    7640               2
## 2 03050500 TYGART VALL…  1948 1948-02-14                    8810               2
## 3 03051000 TYGART VALL…  1948 1948-02-14                   13200               2
## 4 03052000 MIDDLE FORK…  1948 1948-02-14                    8470               2
## 5 03052500 SAND RUN NE…  1948 1948-03-24                    1320               3
## 6 03053500 BUCKHANNON …  1948 1948-02-14                   10300               2
## # ℹ 6 more variables: max_flood_day <int>, season <fct>, station_nm <fct>,
## #   dec_lat_va <dbl>, dec_long_va <dbl>, drain_area_va <dbl>

Extract Precipitation Data by Year

# Directory containing NetCDF files
data_directory <- "noaa_precip_data"


#' Extract precipitation data for flood events in a single year
#' 
#' This function processes all flood events for a given year by opening the 
#' corresponding NetCDF precipitation file once and extracting both single-day 
#' and 3-day precipitation totals for each flood event. The 3-day total includes 
#' precipitation from the flood day and the two preceding days.
#' 
#' @param year_data A data frame containing flood events for a single year.
#'   Must include columns: `max_flood_date`, `dec_lat_va`, `dec_long_va`
#' @param year Integer. The year being processed (used for NetCDF filename construction)
#' @param data_dir Character. Directory path containing NetCDF files. 
#'   Default is "noaa_precip_data"
#' 
#' @return A data frame identical to `year_data` but with two additional columns:
#'   \describe{
#'     \item{precipitation_mm}{Numeric. Single-day precipitation on flood date}
#'     \item{precipitation3_mm}{Numeric. 3-day precipitation total (flood day + 2 prior days)}
#'   }
#'   Both columns will contain NA values for failed extractions or invalid dates.
#' 
#' @details 
#' The function expects NetCDF files following the naming convention: 
#' `precip.V1.0.YYYY.nc` where YYYY is the year.
#' 
#' **Coordinate Handling:**
#' - Longitude coordinates are converted from 0-360° to -180-180° format
#' - Nearest grid cell matching is used for spatial extraction
#' 
#' **Temporal Handling:**
#' - Uses CFtime package for proper time coordinate interpretation
#' - 3-day precipitation window: [flood_date - 2 days, flood_date]
#' 
#' **Limitations:**
#' - Flood events on January 1-2 receive NA for `precipitation3_mm` to avoid 
#'   cross-year file complications
#' - Missing dates in NetCDF files result in NA values
#' - File I/O errors are caught and logged as warnings
#' 
#' @examples
#' \dontrun{
#' # Example year_data structure:
#' floods_2020 <- data.frame(
#'   max_flood_date = as.Date(c("2020-03-15", "2020-07-22")),
#'   dec_lat_va = c(38.8, 39.1),
#'   dec_long_va = c(-79.9, -80.1)
#' )
#' 
#' # Extract precipitation data
#' result <- extract_year_precipitation(floods_2020, 2020, "path/to/netcdf/files")
#' }
#' 
extract_year_precipitation <- function(year_data, year, data_dir = data_directory) {
  
  cat("Processing year", year, "with", nrow(year_data), "flood events...\n")
  
  # Construct NetCDF filename based on your naming convention
  nc_file <- file.path(data_dir, paste0("precip.V1.0.", year, ".nc"))
  
  # Check if file exists
  if (!file.exists(nc_file)) {
    warning("NetCDF file not found for year ", year, ": ", nc_file)
    return(year_data |> 
           mutate(precipitation_mm = NA_real_, 
                  precipitation3_mm = NA_real_))
  }
  
  tryCatch({
    # Open NetCDF file
    nc <- nc_open(nc_file)
    
    # Get coordinates and convert longitude
    lon_raw  <- ncvar_get(nc, "lon")
    lon_grid <- lon_raw - 360  # Convert to western longitude
    lat_grid <- ncvar_get(nc, "lat")
    
    # Handle time using CFtime
    time_dim   <- nc$dim$time
    cf_time    <- CFtime(time_dim$units, time_dim$calendar, time_dim$vals)
    timestamps <- as_timestamp(cf_time)
    file_dates <- as.Date(timestamps)
    
    # Get precipitation variable name (first variable)
    precip_var_name <- names(nc$var)[1]
    
    # Extract precipitation for each flood event
    precip_results <- map_dfr(1:nrow(year_data), function(i) {
      
      flood_date <- year_data$max_flood_date[i]
      
      # Skip events on Jan 1-2 to avoid cross-year complications
      if (month(flood_date) == 1 && day(flood_date) <= 2) {
        return(tibble(precipitation_mm = NA_real_, precipitation3_mm = NA_real_))
      }
      
      # Find nearest grid cell for coordinates
      lon_idx <- which.min(abs(lon_grid - year_data$dec_long_va[i]))
      lat_idx <- which.min(abs(lat_grid - year_data$dec_lat_va[i]))
      
      # Define 3-day window (flood day and 2 days prior)
      date_window <- seq(from = flood_date - days(2), to = flood_date, by = "day")
      
      # Find time indices for all 3 days
      time_indices <- map_int(date_window, ~ {
        idx <- which(file_dates == .x)
        if (length(idx) == 0) return(NA_integer_) else return(idx[1])
      })
      
      # Extract precipitation for single day (flood day)
      flood_day_idx <- time_indices[3]  # Last day in window
      if (is.na(flood_day_idx)) {
        single_day_precip <- NA_real_
      } else {
        single_day_precip <- ncvar_get(nc, precip_var_name, 
                                      start = c(lon_idx, lat_idx, flood_day_idx),
                                      count = c(1, 1, 1))
      }
      
      # Extract precipitation for 3-day window
      if (any(is.na(time_indices))) {
        three_day_precip <- NA_real_
      } else {
        # Extract all 3 days at once
        precip_values <- map_dbl(time_indices, ~ {
          ncvar_get(nc, precip_var_name, 
                   start = c(lon_idx, lat_idx, .x),
                   count = c(1, 1, 1))
        })
        three_day_precip <- sum(precip_values, na.rm = TRUE)
      }
      
      return(tibble(
        precipitation_mm  = as.numeric(single_day_precip),
        precipitation3_mm = as.numeric(three_day_precip)
      ))
    })
    
    # Close NetCDF file
    nc_close(nc)
    
    # Add precipitation values to the data
    year_data |> bind_cols(precip_results)
    
  }, error = function(e) {
    warning("Error processing year ", year, ": ", e$message)
    year_data |> 
      mutate(precipitation_mm  = NA_real_, 
             precipitation3_mm = NA_real_)
  })
}

# Process each year separately
floods_with_precip <- floods_with_coords |>
  group_by(year) |>
  group_modify(~ extract_year_precipitation(.x, .y$year)) |>
  ungroup()

## Processing year 1948 with 37 flood events...
## Processing year 1949 with 13 flood events...
## Processing year 1950 with 38 flood events...
## Processing year 1951 with 31 flood events...
## Processing year 1952 with 37 flood events...
## Processing year 1953 with 38 flood events...
## Processing year 1954 with 38 flood events...
## Processing year 1955 with 2 flood events...
## Processing year 1956 with 38 flood events...
## Processing year 1957 with 27 flood events...
## Processing year 1958 with 34 flood events...
## Processing year 1959 with 38 flood events...
## Processing year 1960 with 34 flood events...
## Processing year 1961 with 38 flood events...
## Processing year 1962 with 36 flood events...
## Processing year 1963 with 38 flood events...
## Processing year 1964 with 38 flood events...
## Processing year 1965 with 35 flood events...
## Processing year 1966 with 40 flood events...
## Processing year 1967 with 40 flood events...
## Processing year 1968 with 41 flood events...
## Processing year 1969 with 41 flood events...
## Processing year 1970 with 36 flood events...
## Processing year 1971 with 23 flood events...
## Processing year 1972 with 40 flood events...
## Processing year 1973 with 5 flood events...
## Processing year 1974 with 38 flood events...
## Processing year 1975 with 36 flood events...
## Processing year 1976 with 40 flood events...
## Processing year 1977 with 18 flood events...
## Processing year 1978 with 39 flood events...
## Processing year 1979 with 31 flood events...
## Processing year 1980 with 31 flood events...
## Processing year 1981 with 37 flood events...
## Processing year 1982 with 33 flood events...
## Processing year 1983 with 33 flood events...
## Processing year 1984 with 34 flood events...
## Processing year 1985 with 38 flood events...
## Processing year 1986 with 16 flood events...
## Processing year 1987 with 22 flood events...
## Processing year 1988 with 32 flood events...
## Processing year 1989 with 38 flood events...
## Processing year 1990 with 38 flood events...
## Processing year 1991 with 21 flood events...
## Processing year 1992 with 23 flood events...
## Processing year 1993 with 33 flood events...
## Processing year 1994 with 38 flood events...
## Processing year 1995 with 32 flood events...
## Processing year 1996 with 37 flood events...
## Processing year 1997 with 26 flood events...
## Processing year 1998 with 35 flood events...
## Processing year 1999 with 34 flood events...
## Processing year 2000 with 35 flood events...
## Processing year 2001 with 35 flood events...
## Processing year 2002 with 34 flood events...
## Processing year 2003 with 33 flood events...
## Processing year 2004 with 19 flood events...
## Processing year 2005 with 31 flood events...
## Processing year 2006 with 11 flood events...
## Processing year 2007 with 30 flood events...
## Processing year 2008 with 21 flood events...
## Processing year 2009 with 17 flood events...
## Processing year 2010 with 33 flood events...
## Processing year 2011 with 34 flood events...
## Processing year 2012 with 20 flood events...
## Processing year 2013 with 33 flood events...
## Processing year 2014 with 23 flood events...
## Processing year 2015 with 36 flood events...
## Processing year 2016 with 35 flood events...
## Processing year 2017 with 33 flood events...
## Processing year 2018 with 36 flood events...
## Processing year 2019 with 34 flood events...
## Processing year 2020 with 18 flood events...
## Processing year 2021 with 35 flood events...
## Processing year 2022 with 36 flood events...
## Processing year 2023 with 26 flood events...
## Processing year 2024 with 29 flood events...

head(floods_with_precip)

## # A tibble: 6 × 14
##    year site_no  station_name max_flood_date max_flood_discharge max_flood_month
##   <int> <fct>    <fct>        <date>                       <int>           <int>
## 1  1948 03050000 TYGART VALL… 1948-02-14                    7640               2
## 2  1948 03050500 TYGART VALL… 1948-02-14                    8810               2
## 3  1948 03051000 TYGART VALL… 1948-02-14                   13200               2
## 4  1948 03052000 MIDDLE FORK… 1948-02-14                    8470               2
## 5  1948 03052500 SAND RUN NE… 1948-03-24                    1320               3
## 6  1948 03053500 BUCKHANNON … 1948-02-14                   10300               2
## # ℹ 8 more variables: max_flood_day <int>, season <fct>, station_nm <fct>,
## #   dec_lat_va <dbl>, dec_long_va <dbl>, drain_area_va <dbl>,
## #   precipitation_mm <dbl>, precipitation3_mm <dbl>

Validate and Summarize Results

# Check extraction results
extraction_summary <- floods_with_precip |>
  summarise(
    total_events = n(),
    successful_1day = sum(!is.na(precipitation_mm)),
    successful_3day = sum(!is.na(precipitation3_mm)),
    failed_1day     = sum(is.na(precipitation_mm)),
    failed_3day     = sum(is.na(precipitation3_mm)),
    success_rate_1day = successful_1day / total_events * 100,
    success_rate_3day = successful_3day / total_events * 100,
    mean_1day_precip   = mean(precipitation_mm,  na.rm = TRUE),
    mean_3day_precip   = mean(precipitation3_mm, na.rm = TRUE),
    median_1day_precip = median(precipitation_mm,  na.rm = TRUE),
    median_3day_precip = median(precipitation3_mm, na.rm = TRUE),
    .groups = "drop"
  )

print(extraction_summary)

## # A tibble: 1 × 11
##   total_events successful_1day successful_3day failed_1day failed_3day
##          <int>           <int>           <int>       <int>       <int>
## 1         2416            2383            2383          33          33
## # ℹ 6 more variables: success_rate_1day <dbl>, success_rate_3day <dbl>,
## #   mean_1day_precip <dbl>, mean_3day_precip <dbl>, median_1day_precip <dbl>,
## #   median_3day_precip <dbl>

# Summary by year
year_summary <- floods_with_precip |>
  group_by(year) |>
  summarise(
    events = n(),
    successful_1day = sum(!is.na(precipitation_mm)),
    successful_3day = sum(!is.na(precipitation3_mm)),
    mean_1day_precip = mean(precipitation_mm,  na.rm = TRUE),
    mean_3day_precip = mean(precipitation3_mm, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(year)

print(year_summary)

## # A tibble: 77 × 6
##     year events successful_1day successful_3day mean_1day_precip
##    <int>  <int>           <int>           <int>            <dbl>
##  1  1948     37              37              37             19.7
##  2  1949     13              13              13             20.6
##  3  1950     38              38              38             34.4
##  4  1951     31              31              31             28.8
##  5  1952     37              37              37             23.4
##  6  1953     38              38              38             16.9
##  7  1954     38              38              38             50.1
##  8  1955      2               2               2             44.5
##  9  1956     38              38              38             46.6
## 10  1957     27              27              27             32.3
## # ℹ 67 more rows
## # ℹ 1 more variable: mean_3day_precip <dbl>

# Check how many Jan 1-2 events were skipped
jan12_events <- floods_with_precip |>
  filter(month(max_flood_date) == 1, day(max_flood_date) <= 2) |>
  nrow()

cat("Events on Jan 1-2 (skipped for 3-day calculation):", jan12_events, "\n")

## Events on Jan 1-2 (skipped for 3-day calculation): 33

Export Results

# Prepare final dataset for export
final_dataset <- floods_with_precip |>
  select(
    site_no, station_name, year, max_flood_date, 
    max_flood_discharge, max_flood_month, max_flood_day,
    precipitation_mm, precipitation3_mm
  )

# Write to CSV
output_file <- "max_annual_floods_with_precipitation.csv"
write_csv(final_dataset, output_file)

# Get unique sites from the dataframe
unique_sites <- floods_with_precip |>
  select(site_no, station_name) |>
  distinct() |>
  arrange(site_no)

cat("Results written to:", output_file, "\n")

## Results written to: max_annual_floods_with_precipitation.csv

cat("Final dataset contains", nrow(final_dataset), "flood events for", nrow(unique_sites), "sites\n")

## Final dataset contains 2416 flood events for 47 sites

# Basic plots to verify the data looks reasonable

# Distribution of 1-day precipitation
p1 <- floods_with_precip |>
  filter(!is.na(precipitation_mm)) |>
  ggplot(aes(x = precipitation_mm)) +
  geom_histogram(bins = 50, alpha = 0.7, fill = "steelblue") +
  labs(
    title = "1-Day Precipitation on Flood Days",
    x = "Precipitation (mm)",
    y = "Count"
  ) +
  theme_minimal()

# Distribution of 3-day precipitation
p2 <- floods_with_precip |>
  filter(!is.na(precipitation3_mm)) |>
  ggplot(aes(x = precipitation3_mm)) +
  geom_histogram(bins = 50, alpha = 0.7, fill = "darkgreen") +
  labs(
    title = "3-Day Precipitation Totals",
    x = "3-Day Precipitation (mm)",
    y = "Count"
  ) +
  theme_minimal()

print(p1)
print(p2)

# Comparison of 1-day vs 3-day precipitation
floods_with_precip |>
  filter(!is.na(precipitation_mm), !is.na(precipitation3_mm)) |>
  ggplot(aes(x = precipitation_mm, y = precipitation3_mm)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "1-Day vs 3-Day Precipitation",
    x = "1-Day Precipitation (mm)",
    y = "3-Day Precipitation (mm)",
    subtitle = "Dashed line shows 1:1 relationship"
  ) +
  theme_minimal()

# Precipitation vs discharge relationship (using 3-day totals)
plot_data <- floods_with_precip |>
  filter(!is.na(precipitation3_mm), !is.na(max_flood_discharge))

# Sample for plotting if too many points
if (nrow(plot_data) > 1000) {
  plot_data <- plot_data |> slice_sample(n = 1000)
}

plot_data |>
  ggplot(aes(x = precipitation3_mm, y = max_flood_discharge)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "3-Day Precipitation vs Maximum Flood Discharge",
    x = "3-Day Precipitation (mm)",
    y = "Maximum Flood Discharge"
  ) +
  theme_minimal()

# Create plots subdirectory if it doesn't exist
if (!dir.exists("plots")) {
  dir.create("plots")
}

Per-Site Plots

Precipitation vs Discharge

# Function to create scatter plot for a single site
plot_site_precipitation <- function(site_data, site_number, station_name) {
  
  # Filter out NA values and ensure we have data to plot
  plot_data <- site_data |>
    filter(!is.na(precipitation3_mm), !is.na(max_flood_discharge), !is.na(season))
  
  if (nrow(plot_data) == 0) {
    cat("No valid data for site", site_number, "-", station_name, "\n")
    return(NULL)
  }
  
  # Create the plot
  p <- plot_data |>
    ggplot(aes(x = precipitation3_mm, y = max_flood_discharge, color = season)) +
    geom_point(size = 2.5, alpha = 0.7) +
    geom_smooth(method = "lm", se = TRUE, color = "black", linewidth = 1) +
    #stat_poly_eq(aes(label = after_stat(eq.label)), parse = TRUE, color = "black", size = 3.5, label.x = 0.05, label.y = 0.95) +
    scale_color_manual(
      name = "Season",
      values = c("Winter" = "#2166ac", "Spring" = "#5aae61", "Summer" = "#d73027", "Autumn" = "#762a83")
    ) +
    scale_y_continuous(labels = comma_format()) +
    labs(
      title = paste0("Site ", site_number, ": ", station_name),
      x = "3-Day Precipitation (mm)",
      y = "Flood Discharge (cfs)",
      subtitle = paste0("n = ", nrow(plot_data), " flood events")
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 12, face = "bold"),
      plot.subtitle = element_text(size = 10),
      legend.position = "right"
    )
  
  return(p)
}


#cat("Creating plots for", nrow(unique_sites), "unique sites...\n")

# Loop through each site and create plots
site_plots <- list()

for (i in 1:nrow(unique_sites)) {
  current_site <- unique_sites$site_no[i]
  current_name <- unique_sites$station_name[i]
  
  # Filter data for current site
  site_data <- floods_with_precip |>
    filter(site_no == current_site)
  
  # Create plot
  plot_obj <- plot_site_precipitation(site_data, current_site, current_name)
  
  if (!is.null(plot_obj)) {
    # Store plot in list
    site_plots[[current_site]] <- plot_obj
    
    # Print plot
    print(plot_obj)
    
    # Optional: save individual plots to files
    ggsave(
      file.path("plots", paste0("site_", current_site, "_precipitation_discharge.png")), 
      plot_obj, width = 10, height = 6, dpi = 300)
  }
  
  #cat("Completed site", i, "of", nrow(unique_sites), "\n")
}

cat("All site plots completed!\n")

## All site plots completed!

Max Floods By Year

#' Create temporal trends scatterplot for a single site
#' 
#' This function creates a scatterplot showing how maximum annual flood 
#' discharge varies over time. Each point represents a single flood event,
#' colored by season with an optional trend line.
#' 
#' @param site_data A data frame containing flood events for a single site.
#'   Must include columns: `year`, `max_flood_discharge`, `season`
#' @param site_number Character/factor. The site number for labeling
#' @param station_name Character/factor. The station name for labeling
#' 
#' @return A ggplot object showing temporal trends, or NULL if insufficient data
#' 
#' @details 
#' **Plot Features:**
#' - Scatterplot with each point representing one flood event per year
#' - Points colored by season using consistent color scheme
#' - Optional trend line to show long-term patterns
#' - Y-axis formatted with comma separators for readability
#' 
#' **Minimum Data Requirements:**
#' - Requires at least 10 flood events with valid years, discharges, and seasons
#' 
plot_site_temporal_trends <- function(site_data, site_number, station_name) {
  
  # Filter out NA values and ensure we have sufficient data
  plot_data <- site_data |>
    filter(!is.na(year), !is.na(max_flood_discharge), !is.na(season))
  
  if (nrow(plot_data) < 10) {
    cat("Insufficient data for site", site_number, "-", station_name, "(", nrow(plot_data), "events)\n")
    return(NULL)
  }
  
  # Calculate year range for better x-axis scaling
  year_range <- range(plot_data$year, na.rm = TRUE)
  year_span <- diff(year_range)
  
  # Create the plot
  p <- plot_data |>
    ggplot(aes(x = year, y = max_flood_discharge, color = season)) +
    geom_point(size = 2.5, alpha = 0.7) +
    geom_smooth(method = "lm", se = TRUE, color = "black", linewidth = 1) +
    scale_color_manual(
      name = "Season",
      values = c("Winter" = "#2166ac", "Spring" = "#5aae61", "Summer" = "#d73027", "Autumn" = "#762a83")
    ) +
    scale_x_continuous(
      breaks = if (year_span <= 20) seq(year_range[1], year_range[2], by = 2) else waiver(),
      minor_breaks = if (year_span <= 40) seq(year_range[1], year_range[2], by = 1) else waiver()
    ) +
    scale_y_continuous(labels = comma_format()) +
    labs(
      title = paste0("Site ", site_number, ": ", station_name),
      x = "Year",
      y = "Flood Discharge (cfs)",
      subtitle = paste0("Temporal trends in maximum annual floods (n = ", nrow(plot_data), " events)")
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 12, face = "bold"),
      plot.subtitle = element_text(size = 10),
      legend.position = "right",
      panel.grid.minor.x = element_line(color = "grey90", linewidth = 0.3)
    )
  
  return(p)
}



#cat("Creating temporal trends plots for", nrow(unique_sites), "unique sites...\n")

# Loop through each site and create temporal trends plots
temporal_plots <- list()

for (i in 1:nrow(unique_sites)) {
  current_site <- unique_sites$site_no[i]
  current_name <- unique_sites$station_name[i]
  
  # Filter data for current site
  site_data <- floods_with_precip |>
    filter(site_no == current_site)
  
  # Create plot
  plot_obj <- plot_site_temporal_trends(site_data, current_site, current_name)
  
  if (!is.null(plot_obj)) {
    # Store plot in list
    temporal_plots[[current_site]] <- plot_obj
    
    # Print plot
    print(plot_obj)
    
    # Save plot
    ggsave(
      file.path("plots", paste0("site_", current_site, "_temporal_trends.png")), 
      plot_obj, width = 10, height = 6, dpi = 300
    )
  }
  
  #cat("Completed temporal trends plot", i, "of", nrow(unique_sites), "\n")
}

cat("All temporal trends plots completed!\n")

## All temporal trends plots completed!

Max Floods By Month

#' Create monthly flood distribution bar plot for a single site
#' 
#' This function creates a bar plot showing the distribution of maximum annual 
#' flood events across calendar months. Each bar represents the count of flood 
#' events occurring in that month, colored by season.
#' 
#' @param site_data A data frame containing flood events for a single site.
#'   Must include columns: `max_flood_month`, `season`
#' @param site_number Character/factor. The site number for labeling
#' @param station_name Character/factor. The station name for labeling
#' 
#' @return A ggplot object showing monthly flood distribution, or NULL if insufficient data
#' 
#' @details 
#' **Plot Features:**
#' - Bar plot with months on x-axis and event counts on y-axis
#' - Bars colored by season using consistent color scheme
#' - All 12 months shown even if no events occurred
#' - Month abbreviations on x-axis for readability
#' 
#' **Season Mapping:**
#' - Winter: December, January, February
#' - Spring: March, April, May  
#' - Summer: June, July, August
#' - Autumn: September, October, November
#' 
#' **Minimum Data Requirements:**
#' - Requires at least 5 flood events with valid months
#' 
plot_site_monthly_distribution <- function(site_data, site_number, station_name) {
  
  # Filter out NA values and ensure we have sufficient data
  plot_data <- site_data |>
    filter(!is.na(max_flood_month))
  
  if (nrow(plot_data) < 5) {
    cat("Insufficient data for site", site_number, "-", station_name, "(", nrow(plot_data), "events)\n")
    return(NULL)
  }
  
  # Create complete month reference with season mapping
  month_reference <- tibble(
    month_num = 1:12,
    month_name = month.abb,
    season = case_when(
      month_num %in% c(12,  1,  2) ~ "Winter",
      month_num %in% c( 3,  4,  5) ~ "Spring", 
      month_num %in% c( 6,  7,  8) ~ "Summer",
      month_num %in% c( 9, 10, 11) ~ "Autumn"
    )
  )
  
  # Count events by month and join with complete month reference
  monthly_counts <- plot_data |>
    count(max_flood_month, name = "flood_count") |>
    right_join(month_reference, by = c("max_flood_month" = "month_num")) |>
    mutate(
      flood_count = replace_na(flood_count, 0),
      month_name  = factor(month_name, levels = month.abb)
    )
  
  # Create the plot
  p <- monthly_counts |>
    ggplot(aes(x = month_name, y = flood_count, fill = season)) +
    geom_col(alpha = 0.8, color = "white", linewidth = 0.3) +
    scale_fill_manual(
      name = "Season",
      values = c("Winter" = "#2166ac", "Spring" = "#5aae61", "Summer" = "#d73027", "Autumn" = "#762a83")
    ) +
    scale_y_continuous(
      breaks = function(x) seq(0, max(x), by = max(1, ceiling(max(x)/8))),
      expand = expansion(mult = c(0, 0.05))
    ) +
    labs(
      title = paste0("Site ", site_number, ": ", station_name),
      x = "Month",
      y = "Number of Flood Events",
      subtitle = paste0("Monthly distribution of maximum annual floods (n = ", sum(monthly_counts$flood_count), " events)")
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 12, face = "bold"),
      plot.subtitle = element_text(size = 10),
      legend.position = "right",
      panel.grid.major.x = element_blank(),
      panel.grid.minor = element_blank(),
      axis.text.x = element_text(angle = 0, hjust = 0.5)
    )
  
  return(p)
}


#cat("Creating monthly distribution plots for", nrow(unique_sites), "unique sites...\n")

# Loop through each site and create monthly distribution plots
monthly_plots <- list()

for (i in 1:nrow(unique_sites)) {
  current_site <- unique_sites$site_no[i]
  current_name <- unique_sites$station_name[i]
  
  # Filter data for current site
  site_data <- floods_with_precip |>
    filter(site_no == current_site)
  
  # Create plot
  plot_obj <- plot_site_monthly_distribution(site_data, current_site, current_name)
  
  if (!is.null(plot_obj)) {
    # Store plot in list
    monthly_plots[[current_site]] <- plot_obj
    
    # Print plot
    print(plot_obj)
    
    # Save plot
    ggsave(
      file.path("plots", paste0("site_", current_site, "_monthly_distribution.png")), 
      plot_obj, width = 10, height = 6, dpi = 300
    )
  }
  
  #cat("Completed monthly distribution plot", i, "of", nrow(unique_sites), "\n")
}

cat("All monthly distribution plots completed!\n")

## All monthly distribution plots completed!

# Function 2: Faceted scatter plots by season
plot_site_faceted <- function(site_data, site_number, station_name) {
  
  # Filter out NA values and ensure we have data to plot
  plot_data <- site_data |>
    filter(!is.na(precipitation3_mm), !is.na(max_flood_discharge), !is.na(season))
  
  if (nrow(plot_data) == 0) {
    cat("No valid data for site", site_number, "-", station_name, "\n")
    return(NULL)
  }
  
  # Calculate reasonable axis limits based on actual data
  x_max <- max(plot_data$precipitation3_mm, na.rm = TRUE) * 1.05
  y_max <- max(plot_data$max_flood_discharge, na.rm = TRUE) * 1.05
  
  # Create the faceted scatter plot
  p <- plot_data |>
    ggplot(aes(x = precipitation3_mm, y = max_flood_discharge)) +
    geom_point(aes(color = season), size = 2, alpha = 0.7) +
    geom_smooth(method = "lm", se = TRUE, color = "black", linewidth = 0.8) +
    facet_wrap(~season, scales = "fixed", ncol = 2) +
    coord_cartesian(xlim = c(0, x_max), ylim = c(0, y_max)) +  # Constrain axes
    scale_color_manual(
      name = "Season",
      values = c("Winter" = "#2166ac", "Spring" = "#5aae61", "Summer" = "#d73027", "Autumn" = "#762a83")
    ) +
    scale_y_continuous(labels = comma_format()) +
    labs(
      title = paste0("Site ", site_number, ": ", station_name),
      x = "3-Day Precipitation (mm)",
      y = "Flood Discharge (cfs)",
      subtitle = paste0("Seasonal precipitation-discharge relationships (n = ", nrow(plot_data), " events)")
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 12, face = "bold"),
      plot.subtitle = element_text(size = 10),
      legend.position = "none",
      strip.text = element_text(face = "bold")
    )
  
  return(p)
}



# Loop 2: Create faceted scatter plots for each site
#cat("Creating faceted scatter plots for", nrow(unique_sites), "unique sites...\n")

faceted_plots <- list()

for (i in 1:nrow(unique_sites)) {
  current_site <- unique_sites$site_no[i]
  current_name <- unique_sites$station_name[i]
  
  site_data <- floods_with_precip |>
    filter(site_no == current_site)
  
  plot_obj <- plot_site_faceted(site_data, current_site, current_name)
  
  if (!is.null(plot_obj)) {
    faceted_plots[[current_site]] <- plot_obj
    print(plot_obj)
    
    # Save plot
    ggsave(
      file.path("plots", paste0("site_", current_site, "_faceted_seasonal.png")),
      plot_obj, width = 10, height = 8, dpi = 300)
  }
  
  #cat("Completed faceted plot", i, "of", nrow(unique_sites), "\n")
}

Flood Discharge By Season

# Function 3: Ridge plots of discharge by season
plot_site_ridges <- function(site_data, site_number, station_name) {
  
  # Filter out NA values and ensure we have data to plot
  plot_data <- site_data |>
    filter(!is.na(max_flood_discharge), !is.na(season))
  
  if (nrow(plot_data) == 0) {
    cat("No valid data for site", site_number, "-", station_name, "\n")
    return(NULL)
  }
  
  # Create the ridge plot
  p <- plot_data |>
    ggplot(aes(x = max_flood_discharge, y = season, fill = season)) +
    geom_density_ridges(alpha = 0.7, scale = 1.2, rel_min_height = 0.01) +
    scale_fill_manual(
      name = "Season",
      values = c("Winter" = "#2166ac", "Spring" = "#5aae61", "Summer" = "#d73027", "Autumn" = "#762a83")
    ) +
    scale_x_continuous(labels = comma_format()) +
    labs(
      title = paste0("Site ", site_number, ": ", station_name),
      x = "Flood Discharge (cfs)",
      y = "Season",
      subtitle = paste0("Seasonal flood discharge distributions (n = ", nrow(plot_data), " events)")
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 12, face = "bold"),
      plot.subtitle = element_text(size = 10),
      legend.position = "none"
    )
  
  return(p)
}


# Loop 3: Create ridge plots for each site
#cat("Creating ridge plots for", nrow(unique_sites), "unique sites...\n")

ridge_plots <- list()

for (i in 1:nrow(unique_sites)) {
  current_site <- unique_sites$site_no[i]
  current_name <- unique_sites$station_name[i]
  
  site_data <- floods_with_precip |>
    filter(site_no == current_site)
  
  plot_obj <- plot_site_ridges(site_data, current_site, current_name)
  
  if (!is.null(plot_obj)) {
    ridge_plots[[current_site]] <- plot_obj
    print(plot_obj)
    
    # Save plot
    ggsave(
      file.path("plots", paste0("site_", current_site, "_ridges_seasonal.png")),
      plot_obj, width = 8, height = 6, dpi = 300)
  }
  
  #cat("Completed ridge plot", i, "of", nrow(unique_sites), "\n")
}

Floods By Day Of Year

#' Create day-of-year flood discharge scatterplot for a single site
#' 
#' This function creates a scatterplot showing the timing and magnitude of 
#' all annual maximum flood events throughout the calendar year. Each point
#' represents a single flood event, colored by season.
#' 
#' @param site_data A data frame containing flood events for a single site.
#'   Must include columns: `max_flood_date`, `max_flood_discharge`, `season`
#' @param site_number Character/factor. The site number for labeling
#' @param station_name Character/factor. The station name for labeling
#' 
#' @return A ggplot object showing day-of-year flood patterns, or NULL if insufficient data
#' 
#' @details 
#' **Data Processing:**
#' - Converts February 29 dates to February 28 for consistent 365-day year
#' - Calculates day-of-year (1-365) for each flood event
#' - Uses existing season classifications for color coding
#' 
#' **Plot Features:**
#' - Scatterplot with each point representing one flood event
#' - Points colored by season using consistent color scheme
#' - Seasonal background shading for visual reference
#' - Logarithmic y-axis to handle wide discharge ranges
#' 
#' **Minimum Data Requirements:**
#' - Requires at least 10 flood events with valid dates and discharges
#' 
#' @examples
#' \dontrun{
#' # Example site_data structure:
#' site_floods <- data.frame(
#'   max_flood_date = as.Date(c("2020-03-15", "2021-07-22", "2022-12-01")),
#'   max_flood_discharge = c(15000, 8500, 12000),
#'   season = c("Spring", "Summer", "Winter")
#' )
#' 
#' # Create day-of-year scatterplot
#' plot <- plot_site_day_of_year(site_floods, "12345678", "Example River")
#' }
#' 
plot_site_day_of_year <- function(site_data, site_number, station_name) {
  
  # Filter out NA values and ensure we have sufficient data
  plot_data <- site_data |>
    filter(!is.na(max_flood_date), !is.na(max_flood_discharge), !is.na(season))
  
  if (nrow(plot_data) < 10) {
    cat("Insufficient data for site", site_number, "-", station_name, "(", nrow(plot_data), "events)\n")
    return(NULL)
  }
  
  # Convert dates to day-of-year, handling leap years
  plot_data <- plot_data |>
    mutate(
      # Convert Feb 29 to Feb 28 for consistency
      adjusted_date = if_else(
        month(max_flood_date) == 2 & day(max_flood_date) == 29,
        as.Date(paste0(year(max_flood_date), "-02-28")),
        max_flood_date
      ),
      # Calculate day of year (1-365) and create date labels for x-axis
      day_of_year = as.numeric(format(adjusted_date, "%j")),
      date_label  = as.Date(paste0("2020-01-01")) + days(day_of_year - 1)
    )
  
  # Define seasonal boundaries for background shading
  seasonal_bounds <- tribble(
    ~season, ~start_day, ~end_day, ~color,
    "Winter",   1,  59, "#E6F3FF",    # Dec 21 - Feb 28 (approx)
    "Spring",  60, 151, "#E6FFE6",    # Mar 1 - May 31 (approx) 
    "Summer", 152, 243, "#FFE6E6",    # Jun 1 - Aug 31 (approx)
    "Autumn", 244, 334, "#FFF0E6",    # Sep 1 - Nov 30 (approx)
    "Winter", 335, 365, "#E6F3FF"     # Dec 1 - Dec 31 (approx)
  )
  
  # Create the plot
  p <- ggplot(plot_data, aes(x = date_label)) +
    # Add seasonal background shading
    geom_rect(
      data = seasonal_bounds,
      aes(xmin = as.Date("2020-01-01") + days(start_day - 1),
          xmax = as.Date("2020-01-01") + days(end_day - 1),
          ymin = -Inf, ymax = Inf),
      fill = seasonal_bounds$color,
      alpha = 0.3,
      inherit.aes = FALSE
    ) +
    # Scatterplot points colored by season
    geom_point(aes(y = max_flood_discharge, color = season), 
               size = 2.5, alpha = 0.7) +
    scale_color_manual(
      name   = "Season",
      values = c("Winter" = "#2166ac", "Spring" = "#5aae61", "Summer" = "#d73027", "Autumn" = "#762a83")
    ) +
    scale_x_date(
      date_labels = "%b",
      date_breaks = "1 month",
      expand = c(0.01, 0.01)
    ) +
    scale_y_log10(
      labels = scales::comma_format(),
      expand = c(0.02, 0.02)
    ) +
    labs(
      title = paste0("Site ", site_number, ": ", station_name),
      subtitle = paste0("Seasonal timing of maximum annual floods (n = ", nrow(plot_data), " events)"),
      x = "Month",
      y = "Flood Discharge (cfs, log scale)"
    ) +
    theme_minimal() +
    theme(
      plot.title    = element_text(size = 12, face = "bold"),
      plot.subtitle = element_text(size = 10),
      legend.position  = "right",
      panel.grid.minor = element_blank(),
      axis.text.x = element_text(angle = 0, hjust = 0.5)
    )
  
  return(p)
}



#cat("Creating day-of-year analysis plots for", nrow(unique_sites), "unique sites...\n")

# Loop through each site and create day-of-year plots
day_of_year_plots <- list()

for (i in 1:nrow(unique_sites)) {
  current_site <- unique_sites$site_no[i]
  current_name <- unique_sites$station_name[i]
  
  # Filter data for current site
  site_data <- floods_with_precip |>
    filter(site_no == current_site)
  
  # Create plot
  plot_obj <- plot_site_day_of_year(site_data, current_site, current_name)
  
  if (!is.null(plot_obj)) {
    # Store plot in list
    day_of_year_plots[[current_site]] <- plot_obj
    
    # Print plot
    print(plot_obj)
    
    # Save plot
    ggsave(
      file.path("plots", paste0("site_", current_site, "_day_of_year_analysis.png")), 
      plot_obj, width = 12, height = 8, dpi = 300
    )
  }
  
  #cat("Completed day-of-year analysis", i, "of", nrow(unique_sites), "\n")
}

Monongahela Watershed Flood Analysis

David A. Riggs <dar0009@mix.wvu.edu>

2025-06-17

Load Required Libraries

Load and Prepare Data

Extract Precipitation Data by Year

Validate and Summarize Results

Export Results

Per-Site Plots

Precipitation vs Discharge

Max Floods By Year

Max Floods By Month

Flood Discharge By Season

Floods By Day Of Year