Payout analysis - Correlated vs Independent Sampling

Case Study: Central America Dry Corridor Portfolio

Author

Zack Arno

This document is an exploratory WORK IN PROGRESS.

Intro/Overview

This is an exploratory document looking at the potential effects and implications of using correlated (dependent) random sampling versus independent sampling in the financial risk exposure analysis of various AA frameworks. Rather than the entire AA portfolio, we focus on the subset of trigger mechanisms covering the Central American Dry Corridor. For this analysis we:

  1. Wrangle historical binary activation data into wide format.
  2. Extract a correlation matrix.
  3. Simulate a sample of 100,000 years that maintains the correlation structure of the original data, comparing two methods: copula and ep (Emrich and Piedmonte).
  4. Simulate a sample of 100,000 years using independent sampling.
  5. Compare the results.

For more details on this analysis, see the GitHub repository.
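Before diving in, the core contrast can be shown in miniature. The sketch below is illustrative only, using made-up probabilities (0.3) and a made-up target correlation (0.8) for two hypothetical triggers rather than anything from the portfolio:

# toy contrast (not portfolio values): two hypothetical triggers that each
# activate in 30% of years
set.seed(1)
p <- c(0.3, 0.3)

# independent sampling: each trigger is its own Bernoulli draw
indep <- cbind(a = rbinom(1e4, 1, p[1]), b = rbinom(1e4, 1, p[2]))

# correlated sampling: joint draws targeting a correlation of 0.8
corr_sim <- simstudy::genCorGen(
  n = 1e4, nvars = 2, params1 = p, dist = "binary",
  corMatrix = matrix(c(1, 0.8, 0.8, 1), nrow = 2),
  method = "ep", wide = TRUE
)

# marginal activation rates match, but joint activations (both triggers in
# the same year) are far more frequent under correlated sampling
mean(indep[, "a"] & indep[, "b"]) # ~ 0.3 * 0.3 = 0.09
mean(corr_sim$V1 & corr_sim$V2)   # substantially higher

For a risk portfolio this is exactly the behavior that matters: correlation concentrates activations into the same years, changing the distribution of total annual payouts even when each trigger's own probability is unchanged.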

box::use(janitor[...])
box::use(dplyr[...])
box::use(purrr[...])
box::use(lubridate[...])
box::use(tidyr[...])
box::use(stringr[...])
box::use(ggplot2[...])
box::use(gghdx[...])
box::use(corrplot[...])
box::use(glue[...])
box::use(gt)

box::use(readr[read_rds])
box::use(forcats[fct_relevel])
box::use(snakecase[to_snake_case])
box::use(AzureStor)
box::use(arrow)

box::use(../R/blob_utils[load_proj_containers])

gghdx()

bc <- load_proj_containers()

# download the historical classified trigger data from blob storage to a temp file
tf <- tempfile(fileext = ".parquet")
AzureStor$download_blob(
  container = bc$PROJECTS_CONT,
  src = "ds-aa-cerf-global-trigger-allocations/df_cadc_historical_classified.parquet",
  dest = tf
)

df <- arrow$read_parquet(tf)

# flag records where the value (mm) falls at or below the threshold quantile (q_val)
df_historical <- df |>
  mutate(
    flag = mm <= q_val
  )

df_payout <-  readr::read_csv(
  file.path(
    "../dashboard/trig_probs_202406_CERF.csv"
  )
)

Data preparation

Below we wrangle the input data a bit and get it into the desired format for extracting a correlation matrix: one row per year, with a binary activation column for each country-window trigger.

df_payout_clean <- df_payout |>
  clean_names() |>
  select(
    country,
    shock,
    amount,
    activation_probability_per_year,
    status
  ) |>
  mutate(
    country_trig = to_snake_case(country),
    shock_clean = to_snake_case(shock)
  ) |>
  unite(
    event_id,
    c("country_trig", "shock_clean"),
    na.rm = TRUE,
    remove = TRUE
  ) |>
  filter(
    str_detect(
      event_id, "guatemala|el_salvador|nicaragua|honduras"
    )
  ) |>
  mutate(
    event_label = case_when(
      str_detect(event_id, "window_a") ~ str_replace_all(event_id, "window_a_drought", "primera"),
      str_detect(event_id, "window_b") ~ str_replace_all(event_id, "window_b_drought", "postera")
    )
  ) |>
  select(event_id, event_label, amount, prob = activation_probability_per_year, status)

df_thresholded_year <- df_historical |>
  filter(
    # INSIVUMEH and ECMWF forecasts have slightly different ranges,
    # therefore filter to only include years present in both
    year(pub_date) %in% 1981:2022
  ) |>
  group_by(
    pub_year = floor_date(pub_date, "year"),
    adm0_es,
    window
  ) |>
  summarise(
    flag_year = ifelse(any(flag), 1, 0),
    .groups = "drop"
  )

df_probs <- df_thresholded_year |>
  group_by(
    adm0_es, window
  ) |>
  summarise(
    prob = mean(flag_year),
    .groups = "drop"
  )

df_activations_wide <- df_thresholded_year |>
  pivot_wider(
    names_from = c("adm0_es", "window"),
    values_from = flag_year
  ) |>
  clean_names()

Here is a cleaned-up table showing how we want the data formatted: one row per year, with a binary activation flag for each country-window trigger.

df_activations_wide |> 
  mutate(
    pub_year = year(pub_year)
  ) |> 
  gt$gt() |> 
  gt$cols_label(
    pub_year = "Year",
    el_salvador_postera = "SLV\n(postera)",
    el_salvador_primera = "SLV\n(primera)",
    guatemala_postera = "GTM\n(postera)",
    guatemala_primera = "GTM\n(primera)",
    honduras_postera = "HND\n(postera)",
    honduras_primera = "HND\n(primera)",
    nicaragua_postera = "NIC\n(postera)",
    nicaragua_primera = "NIC\n(primera)"
  ) |> 
  # color activation (1) values mint green
  gt$data_color(
    columns = colnames(df_activations_wide)[-1],
    colors = scales::col_numeric(
      palette = c("lightgrey", hdx_hex("mint-hdx")),
      domain = c(0, 1)
    )
  ) |>
  gt$tab_header(
    "Trigger Data Format"
  ) |> 
  gt$tab_options(
    container.overflow.y = TRUE
  )
Trigger Data Format

Year  SLV(postera)  SLV(primera)  GTM(postera)  GTM(primera)  HND(postera)  HND(primera)  NIC(postera)  NIC(primera)
1981 0 0 0 0 0 0 0 0
1982 1 0 1 0 1 0 1 0
1983 0 0 1 0 0 0 0 0
1984 1 0 0 0 1 0 0 0
1985 1 1 0 1 0 1 1 1
1986 1 1 0 1 1 1 1 1
1987 0 0 0 0 0 0 0 0
1988 0 0 0 0 0 0 0 0
1989 0 1 0 1 0 1 0 1
1990 0 1 1 0 0 1 0 1
1991 1 1 1 1 1 1 1 1
1992 1 0 1 1 1 0 1 0
1993 0 0 1 0 0 0 0 0
1994 1 1 1 1 1 1 1 1
1995 0 0 0 0 0 0 0 0
1996 0 0 0 0 0 0 0 0
1997 1 1 0 1 1 1 1 1
1998 0 0 0 0 0 0 0 0
1999 0 0 0 0 0 0 0 0
2000 0 0 0 0 0 0 0 0
2001 0 0 1 0 0 0 0 0
2002 1 0 1 0 1 0 1 0
2003 0 0 0 0 0 0 0 0
2004 1 0 1 0 0 0 0 0
2005 0 0 0 0 0 0 0 0
2006 0 0 0 0 0 0 0 0
2007 0 0 0 0 0 0 0 0
2008 0 0 0 0 0 0 0 0
2009 1 1 1 1 1 1 1 1
2010 0 0 0 0 0 0 0 0
2011 0 0 0 0 0 0 0 0
2012 0 1 1 1 0 0 0 0
2013 0 0 0 0 0 0 0 0
2014 1 1 1 1 1 1 1 1
2015 1 1 1 1 1 1 1 1
2016 0 0 0 1 0 0 0 0
2017 0 0 0 1 0 0 0 0
2018 1 1 1 1 1 1 1 1
2019 0 1 1 1 1 1 0 1
2020 0 0 0 0 0 0 0 0
2021 0 1 0 1 0 1 0 0
2022 0 0 0 0 0 1 0 0

Get desired correlations

We extract the pairwise correlations between the binary activation columns as a correlation matrix. All of the correlations are positive, and some are very high.

df_activations_matrix <- df_activations_wide |> 
  select(
    -c(pub_year)
  )

corr_matrix <- cor(
  df_activations_matrix
)
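As a side note, for 0/1 data like this, the Pearson correlation that cor() returns is just the phi coefficient, which can be read straight off the 2x2 table of joint activations. A quick sanity-check sketch on one arbitrary pair of columns (any pair from the wide table would do):

x <- df_activations_wide$guatemala_primera
y <- df_activations_wide$honduras_primera

# 2x2 contingency table of joint (non-)activations
n11 <- sum(x == 1 & y == 1) # both activate
n00 <- sum(x == 0 & y == 0) # neither activates
n10 <- sum(x == 1 & y == 0)
n01 <- sum(x == 0 & y == 1)

# phi coefficient: identical to cor(x, y) for binary data
phi <- (n11 * n00 - n10 * n01) /
  sqrt((n11 + n10) * (n01 + n00) * (n11 + n01) * (n10 + n00))

all.equal(phi, cor(x, y)) # TRUE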

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

# http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram
corrplot(corr_matrix,
         method = "color",
         col = col(20),
         type = "upper",
         # order = "hclust",
         col.lim = c(0.2, 1),
         addCoef.col = "black", # add correlation coefficients
         tl.col = "black",
         tl.srt = 45, # text label color and rotation
         sig.level = 0.01,
         insig = "blank",
         # show correlation coefficients on the principal diagonal
         diag = TRUE
)

Simulate correlated data

Here we use the {simstudy} package to simulate 100,000 years of data with a correlation structure that should approximate the original data. We compare the copula and ep (Emrich and Piedmonte) methods for generating multivariate correlated binary data. The plots below show that the ep approximations are much better. This makes sense given a limitation of the copula method: it first generates correlated continuous data and then converts it to binary, and correlation information is lost in that conversion. Therefore, we use the ep method going forward.
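To see the attenuation in isolation, the sketch below is a simplified stand-in for what the copula route does internally (illustrative only, with an arbitrary latent correlation of 0.7 and event probability of 0.3): it generates correlated normals and then thresholds them into binary outcomes.

set.seed(42)
rho <- 0.7 # arbitrary latent correlation

# correlated bivariate normal
z <- MASS::mvrnorm(
  n = 1e5, mu = c(0, 0),
  Sigma = matrix(c(1, rho, rho, 1), nrow = 2)
)

# threshold each margin at its 30th percentile -> binary events with p = 0.3
b <- 1 * (z < qnorm(0.3))

cor(z[, 1], z[, 2]) # ~ 0.70 on the latent continuous scale
cor(b[, 1], b[, 2]) # noticeably lower once converted to binary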

set.seed(123)
ldf_sim <- map(
  c("COPULA"="copula","EP"="ep"),
  \(method_name){
    dft <- simstudy::genCorGen(
      n = 100000, 
      nvars = nrow(df_probs), 
      params1 = df_probs$prob,
      # params2 = rep(1.2, nrow(df_probs)), # cannot supply params2 for binary data
      dist = "binary", 
      corMatrix = corr_matrix,
      cnames = colnames(corr_matrix),
      method = method_name, 
      wide = TRUE
    )
    tibble(dft) |> 
      rename(
        year = "id"
      )
    
  }
)

# wrangle the original correlation matrix (actual data)
df_corr_orig_long <- corr_matrix |> 
  data.frame() |> 
  tibble::rownames_to_column(var = "framework") |> 
  pivot_longer(-framework, names_to = "framework_compare", values_to = "correlation") |> 
  mutate(
    type = "original"
  )

lp_compare <- ldf_sim |> 
  imap(
    \(dft,method_name){
      df_sim_matrix <- dft[,-1] # remove year col
      sim_corr_matrix <-  cor(df_sim_matrix) # calc corr matrix of sim data
      
      df_corr_sim_long <- sim_corr_matrix |> 
        data.frame() |> 
        tibble::rownames_to_column(var = "framework") |> 
        pivot_longer(-framework, names_to = "framework_compare", values_to = "correlation") |> 
        mutate(
          type = "simulated"
        )
      df_correlations_compare <- bind_rows(
        df_corr_orig_long,
        df_corr_sim_long
      )
      
      df_correlations_compare |> 
        pivot_wider(
          names_from = type,
          values_from = correlation
        ) |> 
        ggplot(aes(x = original, y = simulated))+
        geom_point(alpha = 0.3)+
        # draw a 1:1 reference line on the plot
        geom_abline(intercept = 0, slope = 1, color = "red")+
        # facet_wrap(~framework)+
        theme_minimal()+
        theme(
          axis.text.x = element_text(angle = 45, hjust = 1)
        )+
        labs(
          x= "ORIGINAL: correlation matrix coefficients",
          y = glue("{method_name} SIMULATED: correlation matrix coefficients"),
          title = "Does simulated data have same correlation structure as original?",
          subtitle = glue("{method_name} method")
          
        )
    }
  )

lp_compare$COPULA

lp_compare$EP

Enforce payout logic on correlated sample

Below we enforce the payout/allocation logic: for each country, if the trigger activates in the first window (primera), the allocation is no longer available for the second window (postrera).
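As a toy illustration of the rule (made-up amounts, one country, two simulated years):

# toy input: one country, two windows per year
toy <- tibble(
  year       = c(1, 1, 2, 2),
  window_num = c(1, 2, 1, 2), # 1 = primera, 2 = postrera
  value      = c(1, 1, 0, 1), # year 1: both windows trigger; year 2: only postrera
  payout     = c(3e6, 2e6, 0, 2e6)
)

toy |>
  arrange(year, window_num) |>
  group_by(year) |>
  mutate(
    # postrera pays nothing if primera already activated that year
    payout_conditional = if_else(window_num == 2 & lag(value) == 1, 0, payout)
  )
# year 1 postrera payout -> 0; year 2 postrera keeps its 2e6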

df_sim_conditional <- ldf_sim$EP |> 
  pivot_longer(-1,names_to = "event_label") |> 
  left_join(
    # long join: ~8e5 rows (1e5 simulated years x 8 events)
    df_payout_clean,
    by = "event_label"
  ) |>
  # re-derive columns that were dropped during the earlier wrangling
  mutate(
    country = str_extract(event_label, "el_salvador|guatemala|nicaragua|honduras"),
    payout = ifelse(value == 1, amount, 0)
  ) |>
  mutate(
    window_num = if_else(str_detect(event_label, "postera"), 2, 1)
  ) |> 
  arrange(year, country,window_num) |> 
  group_by(
    year, country
  ) |> 
  mutate(
    payout_conditional = if_else(window_num == 2 & lag(window_num)==1 & lag(value)==1,0,payout)
  ) |> 
  ungroup()


df_total_yearly_payout <- df_sim_conditional |> 
  group_by(year) |> 
  summarise(
    payout = sum(payout_conditional)
  )
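A quick sketch of how the resulting distribution might be summarized into headline risk metrics (just standard dplyr, nothing specific to this analysis):

# headline metrics of the simulated annual payout distribution
df_total_yearly_payout |>
  summarise(
    mean_payout = mean(payout),
    median_payout = median(payout),
    p95 = quantile(payout, 0.95),
    p99 = quantile(payout, 0.99),
    max_payout = max(payout)
  )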

Simulate data based on independent sampling

df_payouts_clean2 <- df_payout_clean |>
  left_join(
    # The CADC probabilities I calculated are slightly different from the ones in the
    # Excel doc used as input data for the previous analysis. The Excel doc is probably
    # based on all ECMWF CD forecasts rather than ECMWF + INSIVUMEH. These are correct,
    # and I can update the Excel doc later; therefore I sub them in when running the
    # independent sampling.
    df_probs |>
      rename(
        prob_adjust = "prob"
      ) |>
      mutate(
        # harmonize labels across the two data sets for joining/plotting
        event_label = to_snake_case(paste0(adm0_es, "_", window))
      ),
    by = "event_label"
  )

df_sample_indep <- split(df_payouts_clean2, df_payouts_clean2$event_label) |>
  map(\(dfe){
    event_sampled <- sample(
      # sample either the full amount OR 0
      x = c(dfe$amount, 0),
      # create 100,000 samples (one per simulated year)
      size = 1e5,
      replace = TRUE,
      # activation probability as recalculated above, therefore
      # probability of non-activation is just 1 - prob
      prob = c(dfe$prob_adjust, 1 - dfe$prob_adjust)
    )

    # create df/tibble w/ results
    tibble(
      event_id = dfe$event_id,
      event_label = dfe$event_label,
      payout = event_sampled,
      status = dfe$status # keep so we can filter later
    )
  }) |>
  list_rbind() |>
  group_by(event_label) |>
  mutate(
    year = row_number()
  ) |>
  ungroup()
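For a single event, the sample() call above is equivalent to a Bernoulli draw scaled by the payout amount, which also gives a quick check that the simulated mean converges to the expected annual payout, prob * amount. A sketch with made-up values:

set.seed(99)
p_event <- 0.25 # made-up activation probability
amount  <- 4e6  # made-up payout amount

# same as sample(c(amount, 0), 1e5, replace = TRUE, prob = c(p_event, 1 - p_event))
payouts <- rbinom(1e5, size = 1, prob = p_event) * amount

mean(payouts)    # should be close to...
p_event * amount # ...the expected annual payout of 1e6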

Enforce payout logic on independently sampled data

# enforce conditional payouts (i.e. no window 2 payout if window 1 already paid out)
df_sample_indep_conditional <- df_sample_indep |>
  mutate(
    country = str_extract(event_id, "el_salvador|guatemala|nicaragua|honduras"),
    window_num = if_else(str_detect(event_id, "window_b"), 2, 1)
  ) |>
  arrange(year, country,window_num) |> 
  group_by(
    year, country
  ) |> 
  mutate(
    payout_conditional = if_else(window_num == 2 & lag(window_num)==1 & lag(payout)>0,0,payout)
  ) |> 
  ungroup()


df_indep_total_yearly_payout <- df_sample_indep_conditional |>
  group_by(year) |> 
  summarise(
    payout = sum(payout_conditional)
  )

Results

df_yearly_payouts_by_sampling <- bind_rows(
  df_indep_total_yearly_payout |> 
    mutate(
      sampling = "Independent Sampling"
    ),
  df_total_yearly_payout |>
    mutate(
      sampling = "Correlated Sampling"
    )
) |>
  ungroup() |> 
  mutate(
    sampling = fct_relevel(sampling, "Independent Sampling","Correlated Sampling")
  )


df_yearly_payouts_by_sampling |> 
  ungroup() |> 
  ggplot(
    aes(x= payout)
  )+
  geom_histogram()+
  facet_wrap(~sampling)+
  scale_x_continuous(labels = scales::label_dollar())+
  scale_y_continuous(labels = scales::label_comma(),
                     limits = c(0,NA),
                     expand = expansion(mult=c(0, 0.01))
  )+
  theme(
    panel.border = element_rect(color = "grey",fill= NA)
  )+
  labs(
    title = "Financial Exposure Distributions of Dry Corridor Portfolio",
    subtitle = "Based on both independent & correlated sampling simulations",
    caption = "Both simulations based on 100k years simulated per event"
  )
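The histograms compress the tails, so it can also help to tabulate a few reference points directly. Since activations cluster together under correlated sampling, we would expect it to produce more zero-payout years alongside a fatter upper tail; a sketch of such a comparison:

# compare tail behavior of the two sampling schemes
df_yearly_payouts_by_sampling |>
  group_by(sampling) |>
  summarise(
    pct_zero_years = mean(payout == 0), # share of years with no payout at all
    mean_payout = mean(payout),
    p99 = quantile(payout, 0.99),
    max_payout = max(payout)
  )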

Discussion/Conclusion

TBD