Intro
Programming some of the code in the CPD workbook into an R workbook to check that I understand it
Code
box:: use (
dplyr[...],
ggplot2[...],
lubridate[...],
tidyr[...],
stringr[...],
gghdx[...],
readr[read_csv,write_csv],
purrr[...],
corrplot[...],
corrr[...],
stats[qbeta],
readxl,
AzureStor,
../ R/ blob_utils[load_proj_containers]
)
gghdx ()
bc <- load_proj_containers ()
monthly_blobs <- AzureStor$ list_blobs (
bc$ PROJECTS_CONT,
dir = "ds-aa-cerf-global-trigger-allocations/aa_historical/monthly"
)
yearly_blobs <- AzureStor$ list_blobs (
bc$ PROJECTS_CONT,
dir = "ds-aa-cerf-global-trigger-allocations/aa_historical/yearly"
)
fp <- '/Users/zackarno/Library/CloudStorage/GoogleDrive-Zachary.arno@humdata.org/Shared drives/Data Science/Collaborations/Financial_Exposure_CERF_CDP/OCHA Simulation Tester_for OCHA_20241025.xlsx'
col <- colorRampPalette (c ("#BB4444" , "#EE9988" , "#FFFFFF" , "#77AADD" , "#4477AA" ))
# read in simulated data to verify against
df_cdp_sims <- readxl$ read_excel (
fp,
sheet = 1 ,
range = "L20:AM10020"
)
# read in correlation groups
df_cdp_stats <- readxl$ read_excel (
fp,
sheet = 1 ,
range = "M2:AM15"
)
# slight clean up on cor groups data
df_cor_groups <- df_cdp_stats[nrow (df_cdp_stats),] |>
pivot_longer (everything ()) |>
mutate (
name = ifelse (name== "CADC - El Salvador (Postera)" ,"CADC - El Salvador (Postrera)" ,name)
)
ldf_yearly <- map (
yearly_blobs$ name ,
\(tb){
AzureStor$ download_blob (
container = bc$ PROJECTS_CONT,
src = tb,
dest = tf <- tempfile (fileext = ".csv" )
)
read_csv (tf)
}
)
df_yearly <- ldf_yearly |>
map (
~ .x |> rename (yr_date= 1 )
) |>
reduce (
full_join, by = "yr_date"
) |>
arrange (desc (yr_date)) |>
mutate (
across (where (is.numeric), ~ as.logical (.x))
)
ldf_monthly <- map (
monthly_blobs$ name ,
\(tb){
AzureStor$ download_blob (
container = bc$ PROJECTS_CONT,
src = tb,
dest = tf <- tempfile (fileext = ".csv" )
)
read_csv (tf)
}
)
df_monthly <- ldf_monthly |>
map (
~ .x |> rename (yr_month= 1 )
) |>
reduce (
full_join, by = "yr_month"
) |>
arrange (desc (yr_month))
Calculate stats/beta params
For Lapalace smoothing they simply add 2 years onto each data set with one year activating and the other not
From that modified trigger count:
alpha = new number of years triggered
beta = new number of years not triggered
Code
# probably all not 100% necessay, but let's calculate anyways
df_params <- df_yearly |>
pivot_longer (- yr_date) |>
group_by (name) |>
summarise (
variance = var (value,na.rm= T),
number_years = sum (! is.na (value)),
standard_error = sd (value,na.rm= T)/ sqrt (number_years),
trigger_year = sum (value,na.rm= T),
trigger_probability = trigger_year/ number_years,
laplace_number_years = number_years + 2 ,
laplace_alpha = trigger_year + 1 ,
laplace_beta = laplace_number_years - laplace_alpha,
laplace_mean = laplace_alpha/ laplace_number_years
)
Simualate from Beta Distribution
Simulate 10k years of continuous data from beta distribution given the new calculated alpha and beta params
Code
num_simulations <- 10000
sim_beta <- function (n, alpha, beta){
replicate (n, qbeta (runif (1 ),alpha,beta))
}
df_beta_sample_r <- df_params %>%
group_by (name) |>
reframe (
beta_rand = sim_beta (n= num_simulations, alpha = laplace_alpha, beta = laplace_beta)
) |>
group_by (name) |>
mutate (
sample_number = row_number ()
) |>
left_join (
df_cor_groups |>
rename (cor_group = "value" ),
by = "name"
)
Convert continuous to logical
per group we generate a random number between 0 & 1 and see whether the beta distribution value is above or below
if above it counts as triggered, if below not triggered. This is meant to impose a correlation structure on the data by groups and seems to somewhat mirror a sort of manual copula method.
Code
ldf_beta_triggers_simulated <- unique (df_beta_sample_r$ cor_group) |>
map (\(cor_group_temp){
df_beta_wide <-
df_beta_sample_r |>
filter (cor_group == cor_group_temp) |>
pivot_wider (names_from = name, values_from = beta_rand,names_prefix = "f_" )
df_beta_wide |>
group_by (sample_number) |>
mutate (
random_number = runif (1 )
) |>
pivot_longer (cols = starts_with ("f_" ), values_to = "beta_random" ) |>
mutate (
trigger = ifelse (beta_random> random_number,1 ,0 )
) |>
pivot_wider (
id_cols = c ("sample_number" ),
names_from = name,
values_from = trigger
) |> ungroup ()
})
df_beta_triggers_simulated <- reduce (ldf_beta_triggers_simulated ,left_join, "sample_number" )
Code
df_beta_triggers_simulated |>
glimpse ()
cadc_postrera_cols <- str_subset (colnames (df_beta_triggers_simulated), "Postrera" )
cadc_primera_cols <- str_subset (colnames (df_beta_triggers_simulated), "Primera" )
condition_postrera <- function (df,col){
primera_col = str_replace (col,"Postrera" ,"Primera" )
df |>
transmute (
!! col := ifelse (!! sym (primera_col)== 1 ,0 ,!! sym (col))
)
}
postrera_conditoned <- cadc_postrera_cols |>
map (
\(col_temp){
condition_postrera (
df_beta_triggers_simulated,
col= col_temp
)
}
) |>
list_cbind ()
df_beta_triggers_simulated <- df_beta_triggers_simulated |>
select (- contains ("Postrera" )) |>
cbind (postrera_conditoned)
# df_payouts_all_conditioned_wide <- df_beta_triggers_simulated |>
# mutate(
# `f_CADC - Guatemala (Postrera)` = ifelse(`f_CADC - Guatemala (Primera)` == 1, 0, ),w
# )
# # across(c("bangladesh_padma_floods", "bangladesh_jamuna_floods"), ~ ifelse(bangladesh_storms > 0, 0, .x)),
# burkina_faso_observational_drought = ifelse(burkina_faso_predictive_1_drought > 0, 0, burkina_faso_observational_drought),
# niger_observational_drought = ifelse(niger_predictive_2_drought > 0, 0, niger_observational_drought),
# guatemala_window_b_drought = ifelse(guatemala_window_a_drought > 0, 0, guatemala_window_b_drought),
# nicaragua_window_b_drought = ifelse(nicaragua_window_a_drought > 0, 0, nicaragua_window_b_drought),
# el_salvador_window_b_drought = ifelse(el_salvador_window_a_drought > 0, 0, el_salvador_window_b_drought),
# honduras_window_b_drought = ifelse(honduras_window_a_drought > 0, 0, honduras_window_b_drought),
# ) |>
# rowwise() |>
# mutate(
# philippines_max = max(philippines_scenario_1_storms, philippines_scenario_2_storms)
# ) |>
# ungroup()
Code
# convert CPD allocation simulation to integer (1 = triggerd,0 = not triggered)
df_cdp_sims_int <- df_cdp_sims |>
mutate (
` Sample Number ` = as.character (` Sample Number ` ),
across (where (is.numeric),~ as.integer (.x> 0 )),
# sample_number = row_number(),
.before= everything ()
)
Compute Correlation Matrices
Code
# compute correlation matrices for both methods
# R
df_cor_sim_r <- correlate (df_beta_triggers_simulated[- 1 ])
# Excel
df_cor_sim_excel <- correlate (
df_cdp_sims_int[- 1 ] |>
# little cleaning
rename (
"CADC - El Salvador (Postrera)" = "CADC - El Salvador (Postera)"
)
)
excel_long <- df_cor_sim_excel |>
# shave() |>
pivot_longer (- term) |>
mutate (
method = "excel"
)
r_long <- df_cor_sim_r |>
# shave() |>
pivot_longer (- term) |>
mutate (
method = "r" ,
across (c ("term" ,"name" ),~ str_remove (.x,"^f_" )),
method = "R" ,
)
df_combined_methods <- bind_rows (
excel_long, r_long
)
The methods look to be lining up, the reason for difference is probably that I didn’t enforce ALL of the payout logic in this analysis (just CADC). That would be good to check, but I believe at this point I think I’ve reached the goal of understanding the method.
Code
ggplot (df_combined_methods,
aes (x = term, y = name, fill = value)) +
geom_tile () +
scale_fill_gradient2 (midpoint = 0.5 , mid = "grey70" , limits = c (- 1 , + 1 ),na.value = "white" ) +
labs (title = "Correlation matrices of simulated trigger activations by method" ,
x = "" , y = "" , fill = "Correlation \n Measure" ) +
facet_wrap (~ method)+
theme (
axis.text.x = element_text (angle= 90 ),
axis.text = element_text (size= 8 )
)