The Takhini River near Whitehorse (09AC001) will be used as a case study to demonstrate a standard FFA using annual maximum flows. Begin by loading the packages used in this investigation.
# Extensions to the R language:
library(tidyverse)
library(glue)
# Modelling extensions:
library(broom)
library(Kendall)
library(quantreg)
library(goftest)
# Data retrieval:
library(tidyhydat)
# For Probability Distributions:
library(distionary)
library(famish)
library(uscore)
library(bgcffa) # temporary
It will also be helpful to specify up front some values of interest.
# Log spacing, for plotting
log_breaks <- unlist(map(-10:10, \(x) 1:10 * 10^x))
# Flow label, for plotting
flow_label_instant <- expression(paste("Instantaneous Peak Flow ", (m^3/s)))
flow_label_daily <- expression(paste("Daily Peak Flow ", (m^3/s)))
# Grid of return periods on a log grid, for plotting.
rp_grid <- exp(seq(log(1.01), log(500), length.out = 500))
# Return periods of interest.
rp_interest <- c(2, 5, 10, 20, 50, 100, 200, 500)
Three datasets are needed for this analysis, and can be downloaded
using tidyhydat:
(annual_instant <- hy_annual_instant_peaks("09AC001") |>
filter(PEAK_CODE == "MAX", Parameter == "Flow") |>
mutate(Year = year(Date)) |>
select(Year, Date, Flow_m3s = Value, Symbol))
## Queried from version of HYDAT released on 2024-07-18
## Observations: 52
## Measurement flags: 0
## Date range: 1965-07-18 to 2019-07-06
## # A tibble: 52 × 4
## Year Date Flow_m3s Symbol
## <dbl> <date> <dbl> <chr>
## 1 1965 1965-07-18 204 <NA>
## 2 1966 1966-07-30 210 <NA>
## 3 1967 1967-06-26 248 <NA>
## 4 1968 1968-07-08 178 <NA>
## 5 1969 1969-06-23 247 <NA>
## 6 1971 1971-08-05 260 <NA>
## 7 1972 1972-08-10 200 <NA>
## 8 1973 1973-07-27 171 <NA>
## 9 1974 1974-08-06 228 <NA>
## 10 1975 1975-07-13 292 <NA>
## # ℹ 42 more rows
(annual_daily <- hy_annual_stats("09AC001") |>
filter(Parameter == "Flow", Sum_stat == "MAX") |>
select(Year, Date, Flow_m3s = Value, Symbol))
## Queried from version of HYDAT released on 2024-07-18
## Observations: 70
## Measurement flags: 4
## Date range: 1949-09-02 to 2019-07-07
## # A tibble: 70 × 4
## Year Date Flow_m3s Symbol
## <int> <date> <dbl> <chr>
## 1 1949 1949-09-02 487 <NA>
## 2 1950 1950-06-22 261 <NA>
## 3 1951 1951-07-21 181 <NA>
## 4 1952 1952-07-28 214 <NA>
## 5 1953 1953-07-04 343 <NA>
## 6 1954 1954-07-22 158 <NA>
## 7 1955 1955-07-14 238 <NA>
## 8 1956 1956-07-28 184 <NA>
## 9 1957 1957-06-28 272 <NA>
## 10 1958 1958-07-11 159 <NA>
## # ℹ 60 more rows
(all_daily <- hy_daily_flows("09AC001") |>
filter(Parameter == "Flow") |>
mutate(Year = year(Date)) |>
select(Year, Date, Flow_m3s = Value, Symbol))
## Queried from version of HYDAT released on 2024-07-18
## Observations: 25,570
## Measurement flags: 14,664
## Date range: 1948-08-01 to 2019-12-31
## # A tibble: 25,570 × 4
## Year Date Flow_m3s Symbol
## <dbl> <date> <dbl> <chr>
## 1 1948 1948-08-01 193 E
## 2 1948 1948-08-02 193 E
## 3 1948 1948-08-03 193 E
## 4 1948 1948-08-04 193 E
## 5 1948 1948-08-05 225 <NA>
## 6 1948 1948-08-06 220 <NA>
## 7 1948 1948-08-07 211 <NA>
## 8 1948 1948-08-08 207 <NA>
## 9 1948 1948-08-09 206 <NA>
## 10 1948 1948-08-10 196 <NA>
## # ℹ 25,560 more rows
There are years where an annual maximum has been recorded for daily flows, but where the instantaneous peak is missing. Data imputation is useful for extracting value from these years.
Joining both datasets allows us to visualize and model relationships between daily and instantaneous flows.
annual_all <- full_join(
annual_instant, annual_daily, by = "Year", suffix = c("_instant", "_daily")
) |>
arrange(Year)
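As a supplementary check (not part of the original workflow), count how many years have a daily maximum but no instantaneous peak; these are the years that imputation will fill in.
annual_all |>
  summarise(n_years_missing_instant = sum(is.na(Flow_m3s_instant)))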
At this point, whenever both the daily and instantaneous flows are
available, we should check to see that they correspond to the same
event. Calculating the difference in days (day_diff) helps
with this evaluation; in doing so, we see the largest difference is 4
days in 2002, so each pairing is likely associated with the same
event.
annual_all |>
mutate(day_diff = abs(difftime(Date_daily, Date_instant, units = "days"))) |>
arrange(desc(day_diff))
## Queried from version of HYDAT released on 2024-07-18
## Observations: 70
## # A tibble: 70 × 8
## Year Date_instant Flow_m3s_instant Symbol_instant Date_daily Flow_m3s_daily
## <dbl> <date> <dbl> <chr> <date> <dbl>
## 1 2002 2002-07-26 151 <NA> 2002-07-30 148
## 2 1966 1966-07-30 210 <NA> 1966-07-27 208
## 3 1969 1969-06-23 247 <NA> 1969-06-22 246
## 4 1974 1974-08-06 228 <NA> 1974-08-07 225
## 5 1975 1975-07-13 292 <NA> 1975-07-14 289
## 6 1978 1978-07-15 156 <NA> 1978-07-16 156
## 7 1979 1979-07-06 250 <NA> 1979-07-07 249
## 8 1986 1986-07-24 260 <NA> 1986-07-23 259
## 9 1994 1994-08-13 173 <NA> 1994-08-12 172
## 10 1999 1999-06-26 211 <NA> 1999-06-27 211
## # ℹ 60 more rows
## # ℹ 2 more variables: Symbol_daily <chr>, day_diff <drtn>
Investigate the relationship between daily and instantaneous flows with a scatterplot. The dashed line is the one-to-one line for reference, and the blue solid line is a linear regression model forced through the origin.
annual_all |>
select(contains("Flow")) |>
drop_na() |>
ggplot(aes(Flow_m3s_daily, Flow_m3s_instant)) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
geom_smooth(method = "lm", se = FALSE, formula = y ~ x - 1, linewidth = 0.5) +
geom_point(alpha = 0.7) +
labs(x = flow_label_daily, y = flow_label_instant) +
theme_bw()
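As an additional numeric summary (a supplementary check, not part of the original workflow), the strength of this relationship can be expressed as a correlation coefficient:
annual_all |>
  select(contains("Flow")) |>
  drop_na() |>
  summarise(correlation = cor(Flow_m3s_daily, Flow_m3s_instant))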
Use lm() to create the linear regression model for
imputation.
imputation_model <- lm(Flow_m3s_instant ~ Flow_m3s_daily - 1, data = annual_all)
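The fitted slope acts like a peaking factor relating daily to instantaneous peaks; it is worth inspecting it (a supplementary check) before relying on the imputed values:
# Coefficient estimate and standard error of the slope (no intercept was fit):
broom::tidy(imputation_model)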
Use broom’s augment() function to append
imputed values in a .fitted column, and use those whenever
an instantaneous flow is missing.
ams_imputed <- broom::augment(imputation_model, newdata = annual_all) |>
mutate(
Imputation_flag = if_else(
is.na(Flow_m3s_instant), "Imputed", "Measured"
),
Date = if_else(
is.na(Flow_m3s_instant), Date_daily, Date_instant
),
Flow_m3s = if_else(
is.na(Flow_m3s_instant), .fitted, Flow_m3s_instant
)
) |>
select(Year, Date, Flow_m3s, Imputation_flag) |>
drop_na()
# Number of flows in the imputed dataset:
num_flows <- nrow(ams_imputed)
Look for distinct populations by visualizing peak timing during the year.
bgcffa::whenplotmag_rect(Date, Flow_m3s, data = ams_imputed) +
labs(x = flow_label_instant)
The large 1949 flow occurred at the beginning of September, later than the typical late June to early August timing observed otherwise. It is possible that this flow arose from a different process than the others. The remaining flows visually form a single cluster, providing no evidence of additional processes.
A regime plot provides more context.
all_daily |>
ggplot(aes(yday(Date), Flow_m3s)) +
geom_line(aes(group = Year, colour = Year == 1949), alpha = 0.5) +
theme_bw() +
labs(x = "Julian Day", y = flow_label_instant) +
scale_colour_manual(values = c("black", "red3"), guide = "none")
This is an example of where local and Indigenous knowledge would be useful to further understand the flow patterns of this river. If we believe that the 1949 peak did indeed arise from a distinct process, more data or information would be needed to model separate distributions for each process. In the absence of such information, the overall distribution of flows must be modelled directly.
The first check for stationarity should be a visual inspection. Here is a time series plot of flows.
bgcffa::ams_timeseries(Date, Flow_m3s, data = ams_imputed) +
ylab(flow_label_instant)
The series appears to be stationary, aside from the 1949 outlier. If we believe that the 1949 flow resulted from a different process than the other flows (Section~@ref(homogeneity)), then the entire series is stationary as long as the process driving the 1949 flow is equally likely to occur in any given year.
This conclusion holds despite the Mann-Kendall trend test suggesting otherwise. Here are the results of the test, with a p-value less than 0.05:
MannKendall(ams_imputed$Flow_m3s)
## tau = -0.163, 2-sided pvalue =0.046305
However, a non-homogeneous population can still be stationary. The perceived non-stationarity arises because, by chance, a single occurrence of this alternative process happened to fall at one end of the series. Removing this flow points towards stationarity in the trend analysis, both visually and inferentially, with a large p-value:
ams_imputed |>
filter(Year != 1949) |>
pull(Flow_m3s) |>
MannKendall()
## tau = -0.139, 2-sided pvalue =0.093276
To further understand this, note that the 1949 flow could occur again in the future – say, next in the series – and a downward trend would no longer be perceived, as illustrated below.
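To illustrate this (a hypothetical scenario, not part of the observed record), append a flow of the same magnitude as the 1949 peak to the end of the series and re-run the test; a significant downward trend is no longer expected.
# Hypothetical: the largest observed flow recurs at the end of the series.
MannKendall(c(ams_imputed$Flow_m3s, max(ams_imputed$Flow_m3s)))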
Checking for linear trends in percentiles also shows no detectable trends in the data (this time, focussing only on the main peak-flow process by excluding 1949). This is a useful method for detecting non-stationarity in other aspects of the flow distribution, such as variance; however, it is limited to linear trends. The blue straight lines represent the fitted deciles (10th, 20th, …, 90th percentiles).
bgcffa::ams_timeseries(
Date, Flow_m3s, data = filter(annual_instant, Year != 1949)
) +
ylab(flow_label_instant) +
geom_quantile(quantiles = 1:9 / 10)
Inferentially, the trends seen in these lines are likely an artifact of sampling variability, since the calculated confidence intervals (conf.low, conf.high) contain 0 for each decile (tau).
rq(
Flow_m3s ~ Date, data = filter(annual_instant, Year != 1949), tau = 1:9 / 10
) |>
broom::tidy() |>
filter(term == "Date")
## # A tibble: 9 × 5
## term estimate conf.low conf.high tau
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Date -0.000549 -0.00311 0.00199 0.1
## 2 Date -0.000641 -0.00339 0.00128 0.2
## 3 Date -0.000894 -0.00272 0.00163 0.3
## 4 Date -0.000810 -0.00197 0.000969 0.4
## 5 Date 0.0000832 -0.00334 0.00102 0.5
## 6 Date -0.000717 -0.00318 0.00116 0.6
## 7 Date -0.00180 -0.00369 0.00194 0.7
## 8 Date -0.000205 -0.00379 0.00428 0.8
## 9 Date 0.00196 -0.00677 0.00439 0.9
There is no autocorrelation detected in the series. The blue dashed lines are expected to contain 95% of the calculated correlations if the series is independent (ignore the first spike at lag zero, which correlates each flow with itself).
acf(ams_imputed$Flow_m3s)
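For reference, the dashed bounds drawn by acf() sit at approximately plus or minus qnorm(0.975) divided by the square root of the series length, under the assumption of independence:
# Approximate 95% bounds shown on the correlogram:
qnorm(0.975) / sqrt(num_flows)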
One aspect of randomness is missingness. Looking at the time series plot, there is a gap in 2018 that should be investigated. Here is a plot of flows for 2017, 2018, and 2019:
all_daily |>
filter(Year >= 2017, Year <= 2019) |>
ggplot(aes(Date, Flow_m3s)) +
geom_point() +
theme_bw() +
ylab(flow_label_daily)
In this case, the missing flows span the 2018 calendar year, suggesting that the missingness is not due to natural causes (such as a large flood) and that the data are therefore missing completely at random.
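As a supplementary check (not part of the original workflow), tally the number of daily records per year around the gap; a year that is absent from the tally, or falls far short of 365 records, indicates missing data.
all_daily |>
  filter(Year >= 2016, Year <= 2019) |>
  count(Year)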
Take a look at a histogram of flows to get a sense of the distribution shape, and the distribution families that might be appropriate.
(p_hist <- ggplot(ams_imputed, aes(Flow_m3s, after_stat(density))) +
geom_histogram(bins = 20) +
theme_bw() +
xlab(flow_label_instant))
Distribution families with a right skew are needed to fit these data. Try the GEV, log-Pearson Type III (LP3), and Gamma families, fit by MLE and by L-moments (where possible). Set the models up in a table:
(models <- tribble(
~ family, ~ method,
"gev", "mle",
"gev", "lmom",
"lp3", "lmom",
"gamma", "mle"
) |>
mutate(key = str_c(family, method, sep = "_")))
## # A tibble: 4 × 3
## family method key
## <chr> <chr> <chr>
## 1 gev mle gev_mle
## 2 gev lmom gev_lmom
## 3 lp3 lmom lp3_lmom
## 4 gamma mle gamma_mle
To fit just one model, use the fit_dst() function from the famish package. Here is a demonstration of the GEV fit by MLE (the one we will eventually select for use).
flow_distribution <- fit_dst(ams_imputed$Flow_m3s, "gev", method = "mle")
Applying this function to the models in the table, make a new column containing the fitted distributions.
(all_distributions <- models |>
mutate(distribution = map2(family, method, \(f, m) {
fit_dst(ams_imputed$Flow_m3s, name = f, method = m)
})))
## # A tibble: 4 × 4
## family method key distribution
## <chr> <chr> <chr> <list>
## 1 gev mle gev_mle <gev>
## 2 gev lmom gev_lmom <gev>
## 3 lp3 lmom lp3_lmom <lpearsn3>
## 4 gamma mle gamma_mle <gamma>
To compare the distributions’ density functions to the histogram, we
could either plot the densities using stat_function(), or
evaluate the densities on a fine grid and plot with
geom_line(). Using the latter, set up a grid of flows:
flow_grid <- seq(150, 500, length.out = 500)
We can leverage the enframe_*() family of functions from
the probaverse to evaluate the density functions.
enframe_density(flow_distribution, at = flow_grid)
## # A tibble: 500 × 2
## .arg density
## <dbl> <dbl>
## 1 150 0.00293
## 2 151. 0.00306
## 3 151. 0.00320
## 4 152. 0.00334
## 5 153. 0.00349
## 6 154. 0.00363
## 7 154. 0.00377
## 8 155. 0.00392
## 9 156. 0.00407
## 10 156. 0.00422
## # ℹ 490 more rows
all_densities <- all_distributions |>
mutate(df = map(
distribution, enframe_density, at = flow_grid, arg_name = "Flow_m3s"
)) |>
select(!distribution) |>
unnest(df)
p_hist +
geom_line(
data = all_densities,
mapping = aes(Flow_m3s, density, colour = key)
)
Comparing the probability density functions to the histogram, all distributions look reasonable. This conformity to the data can be inferentially validated using a distribution test such as the Kolmogorov-Smirnov (ks_pvalue) or Anderson-Darling (ad_pvalue) test, both of which result in large p-values here, suggesting that all of the candidates are realistic models.
all_distributions |>
mutate(
ks_pvalue = map_dbl(distribution, \(d) {
ks.test(annual_instant$Flow_m3s, \(x) eval_cdf(d, at = x))$p.value
}),
ad_pvalue = map_dbl(distribution, \(d) {
ad.test(annual_instant$Flow_m3s, \(x) eval_cdf(d, at = x))$p.value
})
)
## # A tibble: 4 × 6
## family method key distribution ks_pvalue ad_pvalue
## <chr> <chr> <chr> <list> <dbl> <dbl>
## 1 gev mle gev_mle <gev> 0.629 0.597
## 2 gev lmom gev_lmom <gev> 0.569 0.530
## 3 lp3 lmom lp3_lmom <lpearsn3> 0.556 0.516
## 4 gamma mle gamma_mle <gamma> 0.199 0.211
The model screening checked the distributions against the full dataset, but the upper tail of the distribution is what’s most important and what should now determine the final model selection.
Visually, we will compare return period estimates to the data. Return
periods can be estimated from a distribution using
enframe_return() (on either the grid rp_grid
or the hand-picked selection rp_interest we defined
earlier).
(all_returns <- all_distributions |>
mutate(df = map(
distribution, enframe_return, at = rp_grid, arg_name = "return_period",
fn_prefix = "Flow_m3s"
)) |>
select(!distribution) |>
unnest(df))
## # A tibble: 2,000 × 5
## family method key return_period Flow_m3s
## <chr> <chr> <chr> <dbl> <dbl>
## 1 gev mle gev_mle 1.01 140.
## 2 gev mle gev_mle 1.02 147.
## 3 gev mle gev_mle 1.04 151.
## 4 gev mle gev_mle 1.05 155.
## 5 gev mle gev_mle 1.06 157.
## 6 gev mle gev_mle 1.07 160.
## 7 gev mle gev_mle 1.09 162.
## 8 gev mle gev_mle 1.10 164.
## 9 gev mle gev_mle 1.12 166.
## 10 gev mle gev_mle 1.13 168.
## # ℹ 1,990 more rows
Compare the distributions to the empirical estimates of return periods.
(empirical_returns <- ams_imputed |>
mutate(return_period = uscore::rpscore(Flow_m3s)))
## # A tibble: 70 × 5
## Year Date Flow_m3s Imputation_flag return_period
## <dbl> <date> <dbl> <chr> <dbl>
## 1 1949 1949-09-02 492. Imputed 140.
## 2 1950 1950-06-22 264. Imputed 6.09
## 3 1951 1951-07-21 183. Imputed 1.31
## 4 1952 1952-07-28 216. Imputed 2.03
## 5 1953 1953-07-04 347. Imputed 46.7
## 6 1954 1954-07-22 160. Imputed 1.10
## 7 1955 1955-07-14 240. Imputed 2.98
## 8 1956 1956-07-28 186. Imputed 1.36
## 9 1957 1957-06-28 275. Imputed 7.37
## 10 1958 1958-07-11 161. Imputed 1.12
## # ℹ 60 more rows
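For context, these empirical return periods appear consistent with a Hazen-style plotting position, in which the flow with exceedance rank m out of n observations is assigned a return period of n / (m - 0.5); the largest of the 70 flows gives 70 / 0.5 = 140, matching the first row above. The exact convention used by uscore::rpscore() may differ, so the sketch below is illustrative only.
# Illustrative only: Hazen plotting position (rpscore() may use a different convention).
hazen_rp <- function(x) {
  n <- length(x)
  m <- n + 1 - rank(x)  # exceedance rank: 1 = largest flow
  n / (m - 0.5)
}
sort(hazen_rp(ams_imputed$Flow_m3s), decreasing = TRUE) |> head(3)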
Compare the two in a plot, with the black dots representing the data.
ggplot(all_returns, aes(return_period, Flow_m3s)) +
geom_line(aes(colour = key)) +
geom_point(data = empirical_returns, size = 0.7, alpha = 0.7) +
bgcffa::scale_x_gumbelRP("Return Period (years)") +
ylab(flow_label_instant) +
theme_bw() +
scale_colour_discrete("Model")
The Gamma distribution may not be appropriate because its concavity on this plot does not match the pattern of the data. For the others, the outlier makes the decision difficult. Ideally, the distribution would be chosen with awareness of the flood process, but in the absence of such information, choosing among the remaining distributions amounts to splitting hairs.
A quantitative investigation of fit can be conducted through the quantile score, calculated for each return period. This suggests the GEV fit by MLE is the best choice.
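For context, the quantile score is the pinball loss averaged over the observations: for an observation y and candidate quantile q at probability level tau, the loss is tau * (y - q) when y >= q, and (1 - tau) * (q - y) otherwise. A minimal sketch of that per-observation calculation follows; the interface of uscore::quantile_score() may differ in its details.
# Illustrative sketch of the per-observation pinball (quantile) loss:
pinball_loss <- function(y, q, tau) {
  ifelse(y >= q, tau * (y - q), (1 - tau) * (q - y))
}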
all_qscores <- all_returns |>
mutate(
tau = 1 - 1 / return_period,
qscore = map2_dbl(Flow_m3s, tau, \(flow, p) {
mean(quantile_score(ams_imputed$Flow_m3s, flow, tau = p))
})
)
ggplot(all_qscores, aes(return_period, qscore)) +
geom_line(aes(colour = key), alpha = 0.7) +
scale_y_log10("Quantile Score", minor_breaks = log_breaks) +
scale_x_log10(
"Return Period (years)", minor_breaks = log_breaks, limits = c(10, NA)
) +
scale_colour_discrete("Model") +
theme_bw()
The GEV fit by MLE is further appealing because, of the candidate models, it assigns the smallest return period to the 1949 flood event: roughly 500 years. This may still be unrealistic, and local knowledge would be useful to contextualize the frequency of the 1949 flood event. In the absence of such information, if it is common practice to choose a design flow based on the 100- or 200-year flood, a more pragmatic choice in this case would be to use the 1949 flow itself, since it actually happened.
To quantify sampling uncertainty in the selected model, use a parametric bootstrap: generate 1000 resampled flow datasets by simulating from the fitted distribution, keeping the same sample size as the original, and refit a GEV to each.
set.seed(42)
B <- 1000
(bootstrap_distributions <- tibble(resample_id = 1:B) |>
mutate(
distribution = map(resample_id, \(i) {
new_flows <- realise(flow_distribution, n = num_flows)
fit_dst_gev(new_flows, method = "mle")
})
))
## # A tibble: 1,000 × 2
## resample_id distribution
## <int> <list>
## 1 1 <gev>
## 2 2 <gev>
## 3 3 <gev>
## 4 4 <gev>
## 5 5 <gev>
## 6 6 <gev>
## 7 7 <gev>
## 8 8 <gev>
## 9 9 <gev>
## 10 10 <gev>
## # ℹ 990 more rows
Now, any property of interest can be evaluated on each of these fitted distributions, and the spread of the resulting values is what confidence intervals are computed from; it is an estimate of the sampling distribution of the selected property.
For example, we will obtain 95% confidence intervals on a grid of return periods. First, evaluate the quantiles. Enframing the quantile function is useful because it keeps track of the input return periods alongside the corresponding quantiles. Here are the quantile tables for each bootstrap resample.
(bootstrap_rp <- bootstrap_distributions |>
mutate(df = map(
distribution, enframe_return, at = rp_interest,
arg_name = "return_period", fn_prefix = "Flow_m3s"
)) |>
select(!distribution) |>
unnest(df))
## # A tibble: 8,000 × 3
## resample_id return_period Flow_m3s
## <int> <dbl> <dbl>
## 1 1 2 228.
## 2 1 5 282.
## 3 1 10 316.
## 4 1 20 348.
## 5 1 50 388.
## 6 1 100 418.
## 7 1 200 446.
## 8 1 500 483.
## 9 2 2 217.
## 10 2 5 262.
## # ℹ 7,990 more rows
Let’s take a look at the spread of values corresponding to the 200-year return period. The 2.5% and 97.5% quantiles of these estimates form a 95% confidence interval.
bootstrap_rp |>
filter(return_period == 200) |>
mutate(
lower = quantile(Flow_m3s, 0.025),
upper = quantile(Flow_m3s, 0.975)
) |>
ggplot(aes(Flow_m3s)) +
geom_histogram(bins = 30) +
geom_vline(aes(xintercept = lower), linetype = "dashed", colour = "orange3") +
geom_vline(aes(xintercept = upper), linetype = "dashed", colour = "orange3") +
scale_x_continuous(expression(paste("200-year flood estimates ", (m^3/s)))) +
theme_bw()
Compute the interval for each return period.
(rp_confint <- bootstrap_rp |>
group_by(return_period) |>
summarise(
lower = quantile(Flow_m3s, 0.025),
upper = quantile(Flow_m3s, 0.975)
))
## # A tibble: 8 × 3
## return_period lower upper
## <dbl> <dbl> <dbl>
## 1 2 201. 225.
## 2 5 242. 279.
## 3 10 267. 322.
## 4 20 290. 372.
## 5 50 315. 453.
## 6 100 332. 520.
## 7 200 348. 606.
## 8 500 365. 746.
Using our already selected fit, we can enframe the return periods of our choice:
(return_periods <- enframe_return(
flow_distribution, at = rp_interest,
arg_name = "return_period", fn_prefix = "Flow_m3s"
))
## # A tibble: 8 × 2
## return_period Flow_m3s
## <dbl> <dbl>
## 1 2 212.
## 2 5 260.
## 3 10 294.
## 4 20 327.
## 5 50 373.
## 6 100 409.
## 7 200 447.
## 8 500 499.
We can join the confidence intervals to indicate sampling uncertainty.
(return_periods2 <- left_join(return_periods, rp_confint, by = "return_period"))
## # A tibble: 8 × 4
## return_period Flow_m3s lower upper
## <dbl> <dbl> <dbl> <dbl>
## 1 2 212. 201. 225.
## 2 5 260. 242. 279.
## 3 10 294. 267. 322.
## 4 20 327. 290. 372.
## 5 50 373. 315. 453.
## 6 100 409. 332. 520.
## 7 200 447. 348. 606.
## 8 500 499. 365. 746.
On a plot with the data, here is what the estimates look like. You may choose to use a finer grid of return periods to make a smoother plot.
ggplot(return_periods2, aes(return_period, Flow_m3s)) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.5) +
geom_line() +
geom_point(data = empirical_returns, size = 0.7, alpha = 0.7) +
scale_x_gumbelRP("Return Period (years)") +
ylab(flow_label_instant) +
theme_bw()