Implement CDP Methods

Author

Zack Arno

Intro

Programming some of the code in the CPD workbook into an R workbook to check that I understand it

Code
box::use(
  dplyr[...],
  ggplot2[...],
  lubridate[...],
  tidyr[...],
  stringr[...],
  gghdx[...],
  readr[read_csv,write_csv],
  purrr[...],
  corrplot[...],
  corrr[...],
  stats[qbeta],
  readxl,
  AzureStor,
  ../R/blob_utils[load_proj_containers]
)
gghdx()

bc <- load_proj_containers()
monthly_blobs <- AzureStor$list_blobs(
  bc$PROJECTS_CONT,
  dir = "ds-aa-cerf-global-trigger-allocations/aa_historical/monthly"
  
  )
yearly_blobs <- AzureStor$list_blobs(
  bc$PROJECTS_CONT,
  dir = "ds-aa-cerf-global-trigger-allocations/aa_historical/yearly"
  )

fp <- '/Users/zackarno/Library/CloudStorage/GoogleDrive-Zachary.arno@humdata.org/Shared drives/Data Science/Collaborations/Financial_Exposure_CERF_CDP/OCHA Simulation Tester_for OCHA_20241025.xlsx'
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

# read in simulated data to verify against
df_cdp_sims <- readxl$read_excel(
  fp,
  sheet =1 ,
  range = "L20:AM10020"
  )

# read in correlation groups
df_cdp_stats <- readxl$read_excel(
  fp,
  sheet =1 ,
  range = "M2:AM15"
  )

# slight clean up on cor groups data
df_cor_groups <- df_cdp_stats[nrow(df_cdp_stats),] |> 
  pivot_longer(everything()) |> 
  mutate(
    name = ifelse(name=="CADC - El Salvador (Postera)","CADC - El Salvador (Postrera)",name)
  )



ldf_yearly <- map(
  yearly_blobs$name ,
  \(tb){
    AzureStor$download_blob(
      container = bc$PROJECTS_CONT,
      src = tb,
      dest = tf <-  tempfile(fileext = ".csv")
    )
    read_csv(tf)
  }
)

df_yearly <- ldf_yearly |> 
  map(
    ~.x |> rename(yr_date= 1)
  ) |> 
  reduce(
    full_join, by = "yr_date"
  ) |> 
  arrange(desc(yr_date)) |> 
  mutate(
    across(where(is.numeric), ~as.logical(.x))
  )



ldf_monthly <- map(
  monthly_blobs$name ,
  \(tb){
    AzureStor$download_blob(
      container = bc$PROJECTS_CONT,
      src = tb,
      dest = tf <-  tempfile(fileext = ".csv")
    )
    read_csv(tf)
  }
)
df_monthly <- ldf_monthly |> 
  map(
    ~.x |> rename(yr_month= 1)
  ) |> 
  reduce(
    full_join, by ="yr_month"
  ) |> 
  arrange(desc(yr_month))

Calculate stats/beta params

  • For Lapalace smoothing they simply add 2 years onto each data set with one year activating and the other not
  • From that modified trigger count:
    • alpha = new number of years triggered
    • beta = new number of years not triggered
Code
# probably all not 100% necessay, but let's calculate anyways
df_params <- df_yearly |> 
  pivot_longer(-yr_date) |> 
  group_by(name) |> 
  summarise(
    variance = var(value,na.rm=T),
    number_years = sum(!is.na(value)),
    standard_error = sd(value,na.rm=T)/sqrt(number_years),
    trigger_year = sum(value,na.rm=T),
    trigger_probability = trigger_year/number_years,
    laplace_number_years = number_years +2,
    laplace_alpha = trigger_year +1,
    laplace_beta = laplace_number_years - laplace_alpha,
    laplace_mean = laplace_alpha/laplace_number_years
  ) 

Simualate from Beta Distribution

  • Simulate 10k years of continuous data from beta distribution given the new calculated alpha and beta params
Code
num_simulations <-  10000
sim_beta <-  function(n, alpha, beta){
  replicate(n, qbeta(runif(1),alpha,beta))
}

df_beta_sample_r <- df_params %>%
  group_by(name) |> 
  reframe(
    beta_rand = sim_beta(n=num_simulations, alpha = laplace_alpha, beta =laplace_beta)
  ) |> 
  group_by(name) |> 
  mutate(
    sample_number = row_number()
  ) |> 
  left_join(
    df_cor_groups |> 
      rename(cor_group ="value"),
    by = "name"
    ) 

Convert continuous to logical

  • per group we generate a random number between 0 & 1 and see whether the beta distribution value is above or below
  • if above it counts as triggered, if below not triggered. This is meant to impose a correlation structure on the data by groups and seems to somewhat mirror a sort of manual copula method.
Code
ldf_beta_triggers_simulated <- unique(df_beta_sample_r$cor_group) |> 
  map(\(cor_group_temp){
    df_beta_wide <-
      df_beta_sample_r |> 
      filter(cor_group == cor_group_temp) |> 
      pivot_wider(names_from = name, values_from = beta_rand,names_prefix = "f_") 
      
      df_beta_wide |> 
      group_by(sample_number) |>
      mutate(
        random_number = runif(1)
      ) |>
      pivot_longer(cols = starts_with("f_"), values_to ="beta_random") |> 
      mutate(
        trigger = ifelse(beta_random>random_number,1,0)
      ) |> 
      pivot_wider(
        id_cols = c("sample_number"),
        names_from = name,
        values_from = trigger
      ) |> ungroup()
  })

df_beta_triggers_simulated <- reduce(ldf_beta_triggers_simulated ,left_join, "sample_number")
Code
df_beta_triggers_simulated |> 
  glimpse()


cadc_postrera_cols <- str_subset(colnames(df_beta_triggers_simulated), "Postrera")
cadc_primera_cols <- str_subset(colnames(df_beta_triggers_simulated), "Primera")

condition_postrera <- function(df,col){
  primera_col = str_replace(col,"Postrera","Primera")
  df |> 
    transmute(
      !!col := ifelse(!!sym(primera_col)==1,0,!!sym(col))
    )
  
}

postrera_conditoned <- cadc_postrera_cols |> 
  map(
    \(col_temp){
      condition_postrera(
        df_beta_triggers_simulated,
        col= col_temp
        )
    }
  ) |> 
  list_cbind()

df_beta_triggers_simulated <- df_beta_triggers_simulated |> 
  select(- contains("Postrera")) |> 
  cbind(postrera_conditoned)

# df_payouts_all_conditioned_wide <- df_beta_triggers_simulated |>
#   mutate(
#     `f_CADC - Guatemala (Postrera)` = ifelse(`f_CADC - Guatemala (Primera)` == 1, 0, ),w
#   )
#     # across(c("bangladesh_padma_floods", "bangladesh_jamuna_floods"), ~ ifelse(bangladesh_storms > 0, 0, .x)),
#     burkina_faso_observational_drought = ifelse(burkina_faso_predictive_1_drought > 0, 0, burkina_faso_observational_drought),
#     niger_observational_drought = ifelse(niger_predictive_2_drought > 0, 0, niger_observational_drought),
#     guatemala_window_b_drought = ifelse(guatemala_window_a_drought > 0, 0, guatemala_window_b_drought),
#     nicaragua_window_b_drought = ifelse(nicaragua_window_a_drought > 0, 0, nicaragua_window_b_drought),
#     el_salvador_window_b_drought = ifelse(el_salvador_window_a_drought > 0, 0, el_salvador_window_b_drought),
#     honduras_window_b_drought = ifelse(honduras_window_a_drought > 0, 0, honduras_window_b_drought),
#   ) |>
#   rowwise() |>
#   mutate(
#     philippines_max = max(philippines_scenario_1_storms, philippines_scenario_2_storms)
#   ) |>
#   ungroup()
Code
# convert CPD allocation simulation to integer (1 = triggerd,0 = not triggered)
df_cdp_sims_int <- df_cdp_sims |> 
  mutate(
    `Sample Number` = as.character(`Sample Number`),

    across(where(is.numeric),~as.integer(.x>0)),
    # sample_number = row_number(),
    .before= everything()
  )

Compute Correlation Matrices

Code
# compute correlation matrices for both methods

# R
df_cor_sim_r <- correlate(df_beta_triggers_simulated[-1])


# Excel
df_cor_sim_excel <- correlate(
  df_cdp_sims_int[-1] |> 
    # little cleaning
    rename(
      "CADC - El Salvador (Postrera)" ="CADC - El Salvador (Postera)" 
    )
  )


excel_long <- df_cor_sim_excel |> 
  # shave() |> 
  pivot_longer(-term) |> 
  mutate(
    method = "excel"
  )

r_long <- df_cor_sim_r |> 
  # shave() |> 
  pivot_longer(-term) |> 
  mutate(
    method = "r",
    across(c("term","name"),~str_remove(.x,"^f_")),
    method = "R",
  ) 
  
df_combined_methods <- bind_rows(
  excel_long, r_long
)

The methods look to be lining up, the reason for difference is probably that I didn’t enforce ALL of the payout logic in this analysis (just CADC). That would be good to check, but I believe at this point I think I’ve reached the goal of understanding the method.

Code
ggplot(df_combined_methods,
       aes(x = term, y = name, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(midpoint = 0.5, mid ="grey70", limits = c(-1, +1),na.value ="white") +
  labs(title = "Correlation matrices of simulated trigger activations by method", 
  x = "", y = "", fill = "Correlation \n Measure") +
  facet_wrap(~method)+
  theme(
    axis.text.x = element_text(angle=90),
    axis.text = element_text(size=8)
  )