Standard FFA using Annual Maxima

The Takhini River near Whitehorse (09AC001) will be used as a case study to demonstrate a standard FFA using annual maximum flows. Begin by loading the packages used in this investigation.

# Extensions to the R language:
library(tidyverse)
library(glue)
# Modelling extensions:
library(broom)
library(Kendall)
library(quantreg)
library(goftest)
# Data retrieval:
library(tidyhydat)
# Probability distributions:
library(distionary)
library(famish)
library(uscore)
library(bgcffa) # temporary

It will also be helpful to specify up front some values of interest.

# Log spacing, for plotting
log_breaks <- unlist(map(-10:10, \(x) 1:10 * 10^x))
# Flow label, for plotting
flow_label_instant <- expression(paste("Instantaneous Peak Flow ", (m^3/s)))
flow_label_daily <- expression(paste("Daily Peak Flow ", (m^3/s)))
# Fine grid of return periods, equally spaced on the log scale, for plotting.
rp_grid <- exp(seq(log(1.01), log(500), length.out = 500))
# Return periods of interest.
rp_interest <- c(2, 5, 10, 20, 50, 100, 200, 500)

Relevant Data Sources

Three datasets are needed for this analysis, and can be downloaded using tidyhydat:

  • Annual maxima of instantaneous flows:
(annual_instant <- hy_annual_instant_peaks("09AC001") |>
  filter(PEAK_CODE == "MAX", Parameter == "Flow") |> 
  mutate(Year = year(Date)) |> 
  select(Year, Date, Flow_m3s = Value, Symbol))
##   Queried from version of HYDAT released on 2024-07-18
##    Observations:                      52
##    Measurement flags:                 0
##    Date range:                        1965-07-18 to 2019-07-06 
## # A tibble: 52 × 4
##     Year Date       Flow_m3s Symbol
##    <dbl> <date>        <dbl> <chr> 
##  1  1965 1965-07-18      204 <NA>  
##  2  1966 1966-07-30      210 <NA>  
##  3  1967 1967-06-26      248 <NA>  
##  4  1968 1968-07-08      178 <NA>  
##  5  1969 1969-06-23      247 <NA>  
##  6  1971 1971-08-05      260 <NA>  
##  7  1972 1972-08-10      200 <NA>  
##  8  1973 1973-07-27      171 <NA>  
##  9  1974 1974-08-06      228 <NA>  
## 10  1975 1975-07-13      292 <NA>  
## # ℹ 42 more rows
  • Annual maxima of daily flows:
(annual_daily <- hy_annual_stats("09AC001") |> 
  filter(Parameter == "Flow", Sum_stat == "MAX") |> 
  select(Year, Date, Flow_m3s = Value, Symbol))
##   Queried from version of HYDAT released on 2024-07-18
##    Observations:                      70
##    Measurement flags:                 4
##    Date range:                        1949-09-02 to 2019-07-07 
## # A tibble: 70 × 4
##     Year Date       Flow_m3s Symbol
##    <int> <date>        <dbl> <chr> 
##  1  1949 1949-09-02      487 <NA>  
##  2  1950 1950-06-22      261 <NA>  
##  3  1951 1951-07-21      181 <NA>  
##  4  1952 1952-07-28      214 <NA>  
##  5  1953 1953-07-04      343 <NA>  
##  6  1954 1954-07-22      158 <NA>  
##  7  1955 1955-07-14      238 <NA>  
##  8  1956 1956-07-28      184 <NA>  
##  9  1957 1957-06-28      272 <NA>  
## 10  1958 1958-07-11      159 <NA>  
## # ℹ 60 more rows
  • All daily flows:
(all_daily <- hy_daily_flows("09AC001") |> 
  filter(Parameter == "Flow") |> 
  mutate(Year = year(Date)) |> 
  select(Year, Date, Flow_m3s = Value, Symbol))
##   Queried from version of HYDAT released on 2024-07-18
##    Observations:                      25,570
##    Measurement flags:                 14,664
##    Date range:                        1948-08-01 to 2019-12-31 
## # A tibble: 25,570 × 4
##     Year Date       Flow_m3s Symbol
##    <dbl> <date>        <dbl> <chr> 
##  1  1948 1948-08-01      193 E     
##  2  1948 1948-08-02      193 E     
##  3  1948 1948-08-03      193 E     
##  4  1948 1948-08-04      193 E     
##  5  1948 1948-08-05      225 <NA>  
##  6  1948 1948-08-06      220 <NA>  
##  7  1948 1948-08-07      211 <NA>  
##  8  1948 1948-08-08      207 <NA>  
##  9  1948 1948-08-09      206 <NA>  
## 10  1948 1948-08-10      196 <NA>  
## # ℹ 25,560 more rows

Dataset Expansion through Imputation

There are years where an annual maximum has been recorded for daily flows, but where the instantaneous maximum is missing. Data imputation is useful for extracting value from these years.

Joining both datasets allows us to visualize and model relationships between daily and instantaneous flows.

annual_all <- full_join(
  annual_instant, annual_daily, by = "Year", suffix = c("_instant", "_daily")
) |> 
  arrange(Year)
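
To see how much the record stands to gain from imputation, count the years that have a daily maximum but no instantaneous maximum:

# Years with a daily maximum but no instantaneous maximum:
annual_all |> 
  summarise(n_imputable = sum(is.na(Flow_m3s_instant) & !is.na(Flow_m3s_daily)))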

At this point, whenever both the daily and instantaneous flows are available, we should check that they correspond to the same event. Calculating the difference in days (day_diff) helps with this evaluation: the largest difference is 4 days, in 2002, so each pairing is likely associated with the same event.

annual_all |> 
  mutate(day_diff = abs(difftime(Date_daily, Date_instant, units = "days"))) |> 
  arrange(desc(day_diff))
##   Queried from version of HYDAT released on 2024-07-18
##    Observations:                      70
## # A tibble: 70 × 8
##     Year Date_instant Flow_m3s_instant Symbol_instant Date_daily Flow_m3s_daily
##    <dbl> <date>                  <dbl> <chr>          <date>              <dbl>
##  1  2002 2002-07-26                151 <NA>           2002-07-30            148
##  2  1966 1966-07-30                210 <NA>           1966-07-27            208
##  3  1969 1969-06-23                247 <NA>           1969-06-22            246
##  4  1974 1974-08-06                228 <NA>           1974-08-07            225
##  5  1975 1975-07-13                292 <NA>           1975-07-14            289
##  6  1978 1978-07-15                156 <NA>           1978-07-16            156
##  7  1979 1979-07-06                250 <NA>           1979-07-07            249
##  8  1986 1986-07-24                260 <NA>           1986-07-23            259
##  9  1994 1994-08-13                173 <NA>           1994-08-12            172
## 10  1999 1999-06-26                211 <NA>           1999-06-27            211
## # ℹ 60 more rows
## # ℹ 2 more variables: Symbol_daily <chr>, day_diff <drtn>

Investigate the relationship between daily and instantaneous flows with a scatterplot. The dashed line is the one-to-one line for reference, and the blue solid line is a linear regression model forced through the origin.

annual_all |> 
  select(contains("Flow")) |> 
  drop_na() |> 
  ggplot(aes(Flow_m3s_daily, Flow_m3s_instant)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x - 1, linewidth = 0.5) +
  geom_point(alpha = 0.7) +
  labs(x = flow_label_daily, y = flow_label_instant) +
  theme_bw()

Use lm() to create the linear regression model for imputation.

imputation_model <- lm(Flow_m3s_instant ~ Flow_m3s_daily - 1, data = annual_all)
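
The fitted slope can be interpreted as a peaking factor: the typical ratio of the instantaneous peak to the daily peak. Inspect it with coef():

# The slope of the origin-forced regression is the estimated peaking factor.
coef(imputation_model)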

Use broom’s augment() function to append imputed values in a .fitted column, and use those whenever an instantaneous flow is missing.

ams_imputed <- broom::augment(imputation_model, newdata = annual_all) |> 
  mutate(
    Imputation_flag = if_else(
      is.na(Flow_m3s_instant), "Imputed", "Measured"
    ),
    Date = if_else(
      is.na(Flow_m3s_instant), Date_daily, Date_instant
    ),
    Flow_m3s = if_else(
      is.na(Flow_m3s_instant), .fitted, Flow_m3s_instant
    )
  ) |> 
  select(Year, Date, Flow_m3s, Imputation_flag) |> 
  drop_na()
# Number of flows in the imputed dataset:
num_flows <- nrow(ams_imputed)

Exploratory Analysis

Homogeneity

Look for distinct populations by visualizing peak timing during the year.

bgcffa::whenplotmag_rect(Date, Flow_m3s, data = ams_imputed) +
  labs(x = flow_label_instant)

The large 1949 flow occurred at the beginning of September, later than the typical late June – early August timing that is observed otherwise. It is possible that this flow arose from a different process than the others. The remaining flows visually form a single cluster, providing no evidence of additional processes.

A regime plot provides more context.

all_daily |> 
  ggplot(aes(yday(Date), Flow_m3s)) +
  geom_line(aes(group = Year, colour = Year == 1949), alpha = 0.5) +
  theme_bw() +
  labs(x = "Julian Day", y = flow_label_instant) +
  scale_colour_manual(values = c("black", "red3"), guide = "none")

This is an example of where local and Indigenous knowledge would be useful to further understand the flow patterns of this river. If we believe that the 1949 peak did indeed arise from a distinct process, more data or information would be needed to model separate distributions for each process. In the absence of such information, the overall distribution of flows must be modelled directly.

Stationarity

The first check for stationarity should be a visual inspection. Here is a time series plot of flows.

bgcffa::ams_timeseries(Date, Flow_m3s, data = ams_imputed) +
  ylab(flow_label_instant)

The series appears to be stationary, aside from the 1949 outlier. If we believe that the 1949 flow resulted from a different process than the other flows (see Homogeneity, above), then the entire series is stationary as long as the process driving the 1949 flow is equally likely to occur in any given year.

This conclusion holds despite the Mann-Kendall trend test suggesting otherwise. Here are the results of the test, with a p-value less than 0.05:

MannKendall(ams_imputed$Flow_m3s)
## tau = -0.163, 2-sided pvalue =0.046305

However, a non-homogeneous population can still be stationary. The perceived non-stationarity arises because, by chance, the single occurrence of the alternative process fell at one end of the series. Removing this flow points towards stationarity, both visually and inferentially, with a large p-value:

ams_imputed |> 
  filter(Year != 1949) |> 
  pull(Flow_m3s) |> 
  MannKendall()
## tau = -0.139, 2-sided pvalue =0.093276

To further understand this, note that the 1949 flow could occur again in the future – say, next in the series – in which case a downward trend would no longer be perceived.
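
As a quick sketch of this thought experiment, append a copy of the 1949 flow to the end of the series and re-run the test:

# Thought experiment: a 1949-sized flow recurring at the end of the series
# removes the appearance of a downward trend.
flows_extended <- c(
  ams_imputed$Flow_m3s,
  ams_imputed$Flow_m3s[ams_imputed$Year == 1949]
)
MannKendall(flows_extended)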

Checking for linear trends in percentiles also shows no detectable trends in the data (this time, focussing only on the main peak flow process, post-1949). This method is useful for detecting non-stationarity in other aspects of the flow distribution, such as variance; however, it is limited to linear trends. The blue straight lines represent the fitted deciles (10th, 20th, …, 90th percentiles).

bgcffa::ams_timeseries(
  Date, Flow_m3s, data = filter(annual_instant, Year != 1949)
) +
  ylab(flow_label_instant) +
  geom_quantile(quantiles = 1:9 / 10)

Inferentially, the trends seen in these lines are likely a result of sampling variability, since the calculated confidence intervals (conf.low, conf.high) contain 0 for each decile (tau).

rq(
  Flow_m3s ~ Date, data = filter(annual_instant, Year != 1949), tau = 1:9 / 10
) |>
  broom::tidy() |> 
  filter(term == "Date")
## # A tibble: 9 × 5
##   term    estimate conf.low conf.high   tau
##   <chr>      <dbl>    <dbl>     <dbl> <dbl>
## 1 Date  -0.000549  -0.00311  0.00199    0.1
## 2 Date  -0.000641  -0.00339  0.00128    0.2
## 3 Date  -0.000894  -0.00272  0.00163    0.3
## 4 Date  -0.000810  -0.00197  0.000969   0.4
## 5 Date   0.0000832 -0.00334  0.00102    0.5
## 6 Date  -0.000717  -0.00318  0.00116    0.6
## 7 Date  -0.00180   -0.00369  0.00194    0.7
## 8 Date  -0.000205  -0.00379  0.00428    0.8
## 9 Date   0.00196   -0.00677  0.00439    0.9

Independence

There is no autocorrelation detected in the series. The blue dashed lines are expected to contain 95% of the calculated correlations if the series is independent (ignore the spike at lag 0, which relates each flow to itself).

acf(ams_imputed$Flow_m3s)
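
For an inferential complement to this visual check, a portmanteau test such as Ljung-Box (available in base R) can be applied; a large p-value is consistent with independence. A sketch, pooling the first five lags:

# Ljung-Box portmanteau test for autocorrelation at lags 1 through 5:
Box.test(ams_imputed$Flow_m3s, lag = 5, type = "Ljung-Box")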

Randomness

One aspect of randomness is missingness. Looking at the time series plot, there is a gap in 2018 that should be investigated. Here is a plot of flows for 2017, 2018, and 2019:

all_daily |> 
  filter(Year >= 2017, Year <= 2019) |> 
  ggplot(aes(Date, Flow_m3s)) +
  geom_point() +
  theme_bw() +
  ylab(expression(paste("Daily Flow ", (m^3/s))))

In this case, the missing flows span the entire 2018 calendar year, suggesting that the gap is not due to natural causes (such as a large flood) and that the data are therefore plausibly missing completely at random.
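
To confirm the extent of the gap, tally the daily observations per year; years falling well short of 365 records (or absent from the output altogether) flag gaps:

# Flag years with incomplete daily records:
all_daily |> 
  group_by(Year) |> 
  summarise(n_obs = sum(!is.na(Flow_m3s)), .groups = "drop") |> 
  filter(n_obs < 360)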

Modelling

Model Screening

Take a look at a histogram of flows to get a sense of the distribution shape, and the distribution families that might be appropriate.

(p_hist <- ggplot(ams_imputed, aes(Flow_m3s, after_stat(density))) +
  geom_histogram(bins = 20) +
  theme_bw() +
  xlab(flow_label_instant))

Distribution families with right skew are needed to fit these data. Try the Generalized Extreme Value (GEV), Log-Pearson Type III (LP3), and Gamma families, fit by maximum likelihood (MLE) and L-moments (where possible). Set the models up in a table:

(models <- tribble(
  ~ family, ~ method,
  "gev", "mle",
  "gev", "lmom",
  "lp3", "lmom",
  "gamma", "mle"
) |> 
  mutate(key = str_c(family, method, sep = "_")))
## # A tibble: 4 × 3
##   family method key      
##   <chr>  <chr>  <chr>    
## 1 gev    mle    gev_mle  
## 2 gev    lmom   gev_lmom 
## 3 lp3    lmom   lp3_lmom 
## 4 gamma  mle    gamma_mle

To fit just one model, use the fit_dst() function from the famish package. Here is a demonstration of the GEV fit by MLE (the one we will eventually select for use).

flow_distribution <- fit_dst(ams_imputed$Flow_m3s, "gev", method = "mle")

Applying this function to the models in the table, make a new column containing the fitted distributions.

(all_distributions <- models |> 
  mutate(distribution = map2(family, method, \(f, m) {
    fit_dst(ams_imputed$Flow_m3s, name = f, method = m)
  })))
## # A tibble: 4 × 4
##   family method key       distribution
##   <chr>  <chr>  <chr>     <list>      
## 1 gev    mle    gev_mle   <gev>       
## 2 gev    lmom   gev_lmom  <gev>       
## 3 lp3    lmom   lp3_lmom  <lpearsn3>  
## 4 gamma  mle    gamma_mle <gamma>

To compare the distributions’ density functions to the histogram, we could either plot the densities using stat_function(), or evaluate the densities on a fine grid and plot with geom_line(). Using the latter, set up a grid of flows:

flow_grid <- seq(150, 500, length.out = 500)

We can leverage the enframe_*() family of functions from the probaverse to evaluate the density functions.

enframe_density(flow_distribution, at = flow_grid)
## # A tibble: 500 × 2
##     .arg density
##    <dbl>   <dbl>
##  1  150  0.00293
##  2  151. 0.00306
##  3  151. 0.00320
##  4  152. 0.00334
##  5  153. 0.00349
##  6  154. 0.00363
##  7  154. 0.00377
##  8  155. 0.00392
##  9  156. 0.00407
## 10  156. 0.00422
## # ℹ 490 more rows
all_densities <- all_distributions |> 
  mutate(df = map(
    distribution, enframe_density, at = flow_grid, arg_name = "Flow_m3s"
  )) |> 
  select(!distribution) |> 
  unnest(df)

p_hist +
  geom_line(
    data = all_densities,
    mapping = aes(Flow_m3s, density, colour = key)
  )
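
For completeness, here is what the stat_function() approach would look like for a single fit, assuming distionary provides eval_density() as the non-enframed counterpart of enframe_density():

# Overlay the selected GEV density directly, without a pre-computed grid
# (a sketch; eval_density() is assumed to exist and be vectorized).
p_hist +
  stat_function(
    fun = \(x) eval_density(flow_distribution, at = x),
    colour = "blue"
  )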

Comparing the probability density functions to the histogram, all distributions look reasonable. This conformity to the data can be validated inferentially using goodness-of-fit tests such as the Kolmogorov-Smirnov (ks_pvalue) and Anderson-Darling (ad_pvalue) tests, both of which result in large p-values for every model, suggesting they are all realistic.

all_distributions |> 
  mutate(
    ks_pvalue = map_dbl(distribution, \(d) {
      ks.test(annual_instant$Flow_m3s, \(x) eval_cdf(d, at = x))$p.value
    }),
    ad_pvalue = map_dbl(distribution, \(d) {
      ad.test(annual_instant$Flow_m3s, \(x) eval_cdf(d, at = x))$p.value
    })
  )
## # A tibble: 4 × 6
##   family method key       distribution ks_pvalue ad_pvalue
##   <chr>  <chr>  <chr>     <list>           <dbl>     <dbl>
## 1 gev    mle    gev_mle   <gev>            0.629     0.597
## 2 gev    lmom   gev_lmom  <gev>            0.569     0.530
## 3 lp3    lmom   lp3_lmom  <lpearsn3>       0.556     0.516
## 4 gamma  mle    gamma_mle <gamma>          0.199     0.211

Model Selection

The model screening checked the distributions against the full dataset, but the upper tail of the distribution is what’s most important and what should now determine the final model selection.

Visually, we will compare return period estimates to the data. Return periods can be estimated from a distribution using enframe_return() (on either the grid rp_grid or the hand-picked selection rp_interest defined earlier).
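
Under the hood, a T-year return level is simply the quantile at cumulative probability 1 - 1/T. A minimal sketch of this equivalence, assuming distionary provides eval_quantile() alongside the eval_cdf() used earlier:

# The 100-year flood is the 0.99 quantile of the fitted distribution
# (eval_quantile() is assumed to be available):
eval_quantile(flow_distribution, at = 1 - 1 / 100)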

(all_returns <- all_distributions |> 
  mutate(df = map(
    distribution, enframe_return, at = rp_grid, arg_name = "return_period", 
    fn_prefix = "Flow_m3s"
  )) |> 
  select(!distribution) |> 
  unnest(df))
## # A tibble: 2,000 × 5
##    family method key     return_period Flow_m3s
##    <chr>  <chr>  <chr>           <dbl>    <dbl>
##  1 gev    mle    gev_mle          1.01     140.
##  2 gev    mle    gev_mle          1.02     147.
##  3 gev    mle    gev_mle          1.04     151.
##  4 gev    mle    gev_mle          1.05     155.
##  5 gev    mle    gev_mle          1.06     157.
##  6 gev    mle    gev_mle          1.07     160.
##  7 gev    mle    gev_mle          1.09     162.
##  8 gev    mle    gev_mle          1.10     164.
##  9 gev    mle    gev_mle          1.12     166.
## 10 gev    mle    gev_mle          1.13     168.
## # ℹ 1,990 more rows

Compare the distributions to the empirical estimates of return periods.

(empirical_returns <- ams_imputed |> 
  mutate(return_period = uscore::rpscore(Flow_m3s)))
## # A tibble: 70 × 5
##     Year Date       Flow_m3s Imputation_flag return_period
##    <dbl> <date>        <dbl> <chr>                   <dbl>
##  1  1949 1949-09-02     492. Imputed                140.  
##  2  1950 1950-06-22     264. Imputed                  6.09
##  3  1951 1951-07-21     183. Imputed                  1.31
##  4  1952 1952-07-28     216. Imputed                  2.03
##  5  1953 1953-07-04     347. Imputed                 46.7 
##  6  1954 1954-07-22     160. Imputed                  1.10
##  7  1955 1955-07-14     240. Imputed                  2.98
##  8  1956 1956-07-28     186. Imputed                  1.36
##  9  1957 1957-06-28     275. Imputed                  7.37
## 10  1958 1958-07-11     161. Imputed                  1.12
## # ℹ 60 more rows

Compare the two in a plot, with the black dots representing the data.

ggplot(all_returns, aes(return_period, Flow_m3s)) +
  geom_line(aes(colour = key)) +
  geom_point(data = empirical_returns, size = 0.7, alpha = 0.7) +
  bgcffa::scale_x_gumbelRP("Return Period (years)") +
  ylab(flow_label_instant) +
  theme_bw() +
  scale_colour_discrete("Model")

The Gamma distribution may not be appropriate because its concavity does not match that of the data. For the others, the outlier makes the decision difficult. Ideally, the distribution would be chosen with awareness of the flood process, but in the absence of such information, choosing among the remaining distributions is like splitting hairs.

A quantitative investigation of fit can be conducted through the quantile score, calculated for each return period. This suggests the GEV fit by MLE is the best choice.
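
For reference, the quantile score at level tau is the average pinball loss of a predicted quantile against the observations; lower is better. A minimal sketch of the loss presumed to underlie uscore's quantile_score():

# Pinball loss rho_tau(y - q) for observation y and predicted tau-quantile q.
pinball <- function(y, q, tau) {
  u <- y - q
  u * (tau - (u < 0))
}
# e.g., score 300 m^3/s as a candidate 100-year (tau = 0.99) flow:
# mean(pinball(ams_imputed$Flow_m3s, 300, tau = 0.99))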

all_qscores <- all_returns |> 
  mutate(
    tau = 1 - 1 / return_period,
    qscore = map2_dbl(Flow_m3s, tau, \(flow, p) {
      mean(quantile_score(ams_imputed$Flow_m3s, flow, tau = p))
    })
  )
ggplot(all_qscores, aes(return_period, qscore)) +
  geom_line(aes(colour = key), alpha = 0.7) +
  scale_y_log10("Quantile Score", minor_breaks = log_breaks) +
  scale_x_log10(
    "Return Period (years)", minor_breaks = log_breaks, limits = c(10, NA)
  ) +
  scale_colour_discrete("Model") +
  theme_bw()

The GEV fit by MLE is further appealing because, among the candidate models, it assigns the smallest return period to the 1949 flood event: roughly 500 years. This may still be unrealistic, and local knowledge would be useful to contextualize the frequency of the 1949 flood event. In the absence of such information, if it's common practice to choose a design flow based on the 100- or 200-year flood, a more pragmatic choice in this case would be to use the 1949 flow, since it actually happened.
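
To reproduce this figure, evaluate each model's return period at the 1949 flow using the relationship T = 1 / (1 - F(x)), with the same eval_cdf() as before (a sketch):

# Modelled return period of the 1949 flow (the largest on record) under
# each candidate distribution:
flow_1949 <- max(ams_imputed$Flow_m3s)
all_distributions |> 
  mutate(rp_1949 = map_dbl(distribution, \(d) {
    1 / (1 - eval_cdf(d, at = flow_1949))
  }))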

Confidence Intervals

Generate 1000 simulated flow datasets from the selected distribution (a parametric bootstrap), keeping the same sample size as the original.

set.seed(42)
B <- 1000
(bootstrap_distributions <- tibble(resample_id = 1:B) |> 
  mutate(
    distribution = map(resample_id, \(i) {
      new_flows <- realise(flow_distribution, n = num_flows)
      fit_dst_gev(new_flows, method = "mle")
    })
  ))
## # A tibble: 1,000 × 2
##    resample_id distribution
##          <int> <list>      
##  1           1 <gev>       
##  2           2 <gev>       
##  3           3 <gev>       
##  4           4 <gev>       
##  5           5 <gev>       
##  6           6 <gev>       
##  7           7 <gev>       
##  8           8 <gev>       
##  9           9 <gev>       
## 10          10 <gev>       
## # ℹ 990 more rows

Now, the distributions can be evaluated as desired, and their spread represents the uncertainty from which confidence intervals can be computed. In statistical terms, this spread is an estimate of the sampling distribution of the selected quantity.

For example, we will obtain 95% confidence intervals on a grid of return periods. First, evaluate the quantiles – enframing the quantile function is useful because it keeps track of the inputted return periods rather than returning only the quantile values. Here are the quantile tables for each bootstrap resample.

(bootstrap_rp <- bootstrap_distributions |> 
  mutate(df = map(
    distribution, enframe_return, at = rp_interest, 
    arg_name = "return_period", fn_prefix = "Flow_m3s"
  )) |> 
  select(!distribution) |> 
  unnest(df))
## # A tibble: 8,000 × 3
##    resample_id return_period Flow_m3s
##          <int>         <dbl>    <dbl>
##  1           1             2     228.
##  2           1             5     282.
##  3           1            10     316.
##  4           1            20     348.
##  5           1            50     388.
##  6           1           100     418.
##  7           1           200     446.
##  8           1           500     483.
##  9           2             2     217.
## 10           2             5     262.
## # ℹ 7,990 more rows

Let’s take a look at the spread of values corresponding to the 200-year return period. The 2.5% and 97.5% quantiles of these estimates form a 95% confidence interval.

bootstrap_rp |> 
  filter(return_period == 200) |>
  mutate(
    lower = quantile(Flow_m3s, 0.025), 
    upper = quantile(Flow_m3s, 0.975)
  ) |> 
  ggplot(aes(Flow_m3s)) +
  geom_histogram(bins = 30) +
  geom_vline(aes(xintercept = lower), linetype = "dashed", colour = "orange3") +
  geom_vline(aes(xintercept = upper), linetype = "dashed", colour = "orange3") +
  scale_x_continuous(expression(paste("200-year flood estimates ", (m^3/s)))) +
  theme_bw()

Compute the interval for each return period.

(rp_confint <- bootstrap_rp |> 
  group_by(return_period) |> 
  summarise(
    lower = quantile(Flow_m3s, 0.025),
    upper = quantile(Flow_m3s, 0.975)
  ))
## # A tibble: 8 × 3
##   return_period lower upper
##           <dbl> <dbl> <dbl>
## 1             2  201.  225.
## 2             5  242.  279.
## 3            10  267.  322.
## 4            20  290.  372.
## 5            50  315.  453.
## 6           100  332.  520.
## 7           200  348.  606.
## 8           500  365.  746.

Final Quantile Estimates

Using our already selected fit, we can enframe the return periods of our choice:

(return_periods <- enframe_return(
  flow_distribution, at = rp_interest,
  arg_name = "return_period", fn_prefix = "Flow_m3s"
))
## # A tibble: 8 × 2
##   return_period Flow_m3s
##           <dbl>    <dbl>
## 1             2     212.
## 2             5     260.
## 3            10     294.
## 4            20     327.
## 5            50     373.
## 6           100     409.
## 7           200     447.
## 8           500     499.

We can join the confidence intervals to indicate sampling uncertainty.

(return_periods2 <- left_join(return_periods, rp_confint, by = "return_period"))
## # A tibble: 8 × 4
##   return_period Flow_m3s lower upper
##           <dbl>    <dbl> <dbl> <dbl>
## 1             2     212.  201.  225.
## 2             5     260.  242.  279.
## 3            10     294.  267.  322.
## 4            20     327.  290.  372.
## 5            50     373.  315.  453.
## 6           100     409.  332.  520.
## 7           200     447.  348.  606.
## 8           500     499.  365.  746.

On a plot with the data, here is what the estimates look like. You may choose to use a finer grid of return periods to make a smoother plot.

ggplot(return_periods2, aes(return_period, Flow_m3s)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.5) +
  geom_line() +
  geom_point(data = empirical_returns, size = 0.7, alpha = 0.7) +
  scale_x_gumbelRP("Return Period (years)") +
  ylab(flow_label_instant) +
  theme_bw()