box::use(janitor[...])
box::use(dplyr[...])
box::use(purrr[...])
box::use(lubridate[...])
box::use(tidyr[...])
box::use(stringr[...])
box::use(ggplot2[...])
box::use(gghdx[...])
box::use(corrplot[...])
box::use(glue[...])
box::use(gt)
box::use(readr[read_rds])
box::use(forcats[fct_relevel])
box::use(snakecase[to_snake_case])
box::use(AzureStor)
box::use(arrow)
box::use(../R/blob_utils[load_proj_containers])
gghdx()
bc <- load_proj_containers()
AzureStor$download_blob(
container = bc$PROJECTS_CONT,
src = "ds-aa-cerf-global-trigger-allocations/df_cadc_historical_classified.parquet",
dest = tf <- tempfile(fileext = ".parquet")
)
df <- arrow$read_parquet(tf)
df_historical <- df |>
mutate(
flag = mm <= q_val
)
df_payout <- readr::read_csv(
file.path(
"../dashboard/trig_probs_202406_CERF.csv"
)
)This document is an exploratory WORK IN PROGRESS.
Intro/Overview.
This is an exploratory document looking at the potential effects/implications of using dependent correlated random sampling vs independent sampling on financial risk exposure analysis of various AA frameworks. For this analysis we look at the Central American Dry Corridor trigger mechanisms subset as opposed to the entire AA portfolio. For this analysis we:
- wrangle historical binary activation data into wide format
- Extract a correlation matrix
- Simulate a sample of 100,000 years that maintains the correlations structure of the original data comparing two methods:
copula,ep - Simulate a sample of 100,000 years using independent sampling
- Compare the results.
For more details on this analysis see the GitHub repository
Data preparation
Below we wrangle the input data a bit and get into the desired format for extracting a correlation matrix.
df_payout_clean <- df_payout |>
clean_names() |>
select(
country,
shock,
amount,
activation_probability_per_year,
status
) |>
mutate(
country_trig = to_snake_case(country),
shock_clean = to_snake_case(shock)
)|>
unite(
event_id,
c("country_trig", "shock_clean"),
na.rm = T,
remove = T
) |>
filter(
str_detect(
event_id, "guatemala|el_salvador|nicaragua|honduras"
)
)|>
mutate(
event_label = case_when(
str_detect(event_id, "window_a")~ str_replace_all(event_id,"window_a_drought","primera"),
str_detect(event_id, "window_b")~ str_replace_all(event_id,"window_b_drought","postera")
)
) |>
select(event_id,event_label, amount, prob = activation_probability_per_year, status)df_thresholded_year <- df_historical |>
filter(
# insiv and ecmwf forecasts have slightly different ranges
# therefore filter to only include years that are in both
year(pub_date) %in% c(1981:2022)
) |>
group_by(
pub_year=floor_date(pub_date, "year"),
adm0_es,
window
) |>
summarise(
flag_year = ifelse(any(flag),1,0), .groups ="drop"
)
df_probs <- df_thresholded_year |>
group_by(
adm0_es, window
) |>
summarise(
prob = mean(flag_year),.groups="drop"
)
df_activations_wide <- df_thresholded_year |>
pivot_wider(
names_from = c("adm0_es","window"),,
values_from =flag_year
) |>
clean_names() Here is a cleaned up table which shows more or less how we want the data formatted.
df_activations_wide |>
mutate(
pub_year = year(pub_year)
) |>
gt$gt() |>
gt$cols_label(
pub_year = "Year",
el_salvador_postera = "SLV\n(postera)",
el_salvador_primera = "SLV\n(primera)",
guatemala_postera = "GTM\n(postera)",
guatemala_primera = "GTM\n(primera)",
honduras_postera = "HND\n(postera)",
honduras_primera = "HND\n(primera)",
nicaragua_postera = "NIC\n(postera)",
nicaragua_primera = "NIC\n(primera)"
) |>
# color 1 values mint green
gt$data_color(
columns = colnames(df_activations_wide)[-1],
colors = scales::col_numeric(
palette = c("lightgrey", hdx_hex("mint-hdx")),
domain = c(0,1)
)
) |>
gt$tab_header(
"Trigger Data Format"
) |>
gt$tab_options(
container.overflow.y = TRUE
)| Trigger Data Format | ||||||||
|---|---|---|---|---|---|---|---|---|
| Year | SLV (postera) | SLV (primera) | GTM (postera) | GTM (primera) | HND (postera) | HND (primera) | NIC (postera) | NIC (primera) |
| 1981 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1982 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 1983 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1984 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1985 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 |
| 1986 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 |
| 1987 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1988 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1989 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| 1990 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
| 1991 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 1992 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
| 1993 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1994 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 1995 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1996 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1997 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 |
| 1998 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1999 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2001 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2002 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 2003 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2004 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2005 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2006 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2007 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2008 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2009 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 2010 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2011 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2012 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
| 2013 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2014 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 2015 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 2016 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2017 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2018 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 2019 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
| 2020 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2021 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 2022 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
Get desired correlations
We extract the correlations as a correlation matrix. We can see all positive correlations (some very high)
df_activations_matrix <- df_activations_wide |>
select(
-c(pub_year)
)
corr_matrix <- cor(
df_activations_matrix
)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
# http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram
corrplot(corr_matrix,
method="color",
col=col(20),
type="upper",
# order="hclust",
col.lim = c(0.2,1),
addCoef.col = "black", # Add coefficient of correlation
tl.col="black",
tl.srt=45, #Text label color and rotation
sig.level = 0.01,
insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=TRUE
)Simulate data based on independent sample
df_payouts_clean2 <- df_payout_clean %>%
left_join(
# The CADC probabilities I calculated are slightly different than the ones put in the excel doc that is
# used as input data for the previous analysis. The excel doc is probably based on all ECMWF CD forecasts
# rather than ECMWF + INSIVUMEH. These are correct and I can update the excel doc later. Therefore I
# sub these in when running independent sampling
df_probs |>
rename(
prob_adjust = "prob"
) |>
mutate(
# some hacky wrangling to harmonize data sets labels for plotting
event_label = to_snake_case(paste0(adm0_es,"_", window))
)
)
df_sample_indep <- split(df_payouts_clean2,df_payouts_clean2$event_label) |>
map(\(dfe){
event_sampled <- sample(
# sample either the full amount OR 0
x = c(dfe$amount, 0),
# create 10000 samples
size = 1e5,
replace = T,
# Probability defined in spreadsheet, therefore
# probability of non-activation is just 1- prob
prob = c(dfe$prob_adjust, 1 - dfe$prob_adjust)
)
# create df/tibble w/ results
tibble(
event_id = dfe$event_id,
event_label = dfe$event_label,
payout = event_sampled,
status = dfe$status # keep in so we can filter later
)
}) %>%
list_rbind() |>
group_by(event_label) %>%
mutate(
year = row_number()
) |>
ungroup()Enforce payout logic on indepently sampled data
# enforce conditional payouts (i.e cannot have payout in window 2 if payout in window 1)
df_sample_indep_conditonal <- df_sample_indep |>
group_by(
year
) |>
mutate(
country = str_extract(event_id, "el_salvador|guatemala|nicaragua|honduras"),
window_num = if_else(str_detect(event_id,"window_b"),2,1)
) |>
arrange(year, country,window_num) |>
group_by(
year, country
) |>
mutate(
payout_conditional = if_else(window_num == 2 & lag(window_num)==1 & lag(payout)>0,0,payout)
) |>
ungroup()
df_indep_total_yearly_payout <- df_sample_indep_conditonal |>
ungroup() |>
group_by(year) |>
summarise(
payout = sum(payout_conditional)
)Results
df_yearly_payouts_by_sampling <- bind_rows(
df_indep_total_yearly_payout |>
mutate(
sampling = "Independent Sampling"
),
df_total_yearly_payout |>
mutate(
sampling = "Correlated Sampling",
)
) |>
ungroup() |>
mutate(
sampling = fct_relevel(sampling, "Independent Sampling","Correlated Sampling")
)
df_yearly_payouts_by_sampling |>
ungroup() |>
ggplot(
aes(x= payout)
)+
geom_histogram()+
facet_wrap(~sampling)+
scale_x_continuous(labels =scales::label_dollar())+
scale_y_continuous(labels= scales::label_comma(),
limits = c(0,NA),
expand = expansion(mult=c(0, 0.01))
)+
theme(
panel.border = element_rect(color = "grey",fill= NA)
)+
labs(
title = "Financial Exposure Distributions of Dry Corridor Portfolio",
subtitle = "Based on both independent & correlated sampling simulations",
caption = "Both simulations based on 100k years simulated per event"
)Discussion/Conclusion
TBD