Dry Corridor - Guatemala StartNetwork Framework

Introduction

This document is the historical analysis for a alteration of the framework that StartNetwork is evaluating.

Monitoring Specs:

Code

box::use(
  dplyr[...],
  tidyr[...],
  janitor[...],
  stringr[...],
  glue[...],
  rlang[...],
  purrr[...],
  gghdx[...],
  ggplot2[...],
  forcats[...],
  sf[...],
  readr[...],
  lubridate[...],
  ggrepel[...],
  gt[...],
  geoarrow[...],
  arrow[...],
  cumulus
)

box::use(
  utils = ../ src/utils/gen_utils
)

gghdx()
txt_label_size <- 3

Code

df_monitor_spec <- tibble::tribble(
    ~`Decision Type`, ~`Decision`,
    "Event Type", "Primera (MJJA) seasonal rainfall deficit",
    "Geographic level of analysis", "Admin 1 level",
    "Forecast Lead Time", "Up to 4 months (March onwards)",
    "Activation thresholds", "1 in 4 year return period rainfall deficit. Measured per lead time, per admin unit",
    "Forecast to use", "March/April: National (INSIVUMEH)\nMay: ECMWF SEAS5"
  )
gt(df_monitor_spec) |> 
  tab_header(
    "StartNetwork Monitoring Specs"
  ) |> 
  tab_options(
       heading.background.color = "#00ad78ff",
    column_labels.background.color = hdx_hex("mint-ultra-light")
  )

StartNetwork Monitoring Specs
Decision Type	Decision
Event Type	Primera (MJJA) seasonal rainfall deficit
Geographic level of analysis	Admin 1 level
Forecast Lead Time	Up to 4 months (March onwards)
Activation thresholds	1 in 4 year return period rainfall deficit. Measured per lead time, per admin unit
Forecast to use	March/April: National (INSIVUMEH) May: ECMWF SEAS5

Code

df_combined_thresholds <- cumulus$blob_read(
    container = "projects",
    name = "ds-aa-lac-dry-corridor/framework_update_2025/df_thresholds_seas5_insivumeh_startnetwork_guatemela.parquet"
  )


# insivumeh zonal stats from parquet on blob - - not ecmwf comes from postgres in code below
df_insivumeh_zonal <- cumulus$blob_read(
  container = "projects",
  name =  "ds-aa-lac-dry-corridor/df_startnetwork_historical_zonal_primera.parquet"
)


df_aoi <- utils$load_aoi_df(version = "startnetwork")

Area of Interest

Below we see the area of interest as a table and a map.

Code

tibble::tribble(
          ~Country,                                                                     ~Departments,
       "Guatemala",                                 "Baja Verapaz & Quiche"
     ) |> 
  gt() |> 
  tab_header(
    "Analysis filtered to admin 1's listed under departments"
  ) |> 
  tab_options(
       heading.background.color = "#00ad78ff",
    column_labels.background.color = hdx_hex("mint-ultra-light")
  )

Analysis filtered to admin 1's listed under departments
Country	Departments
Guatemala	Baja Verapaz & Quiche

Code

# Prep Meta Lookup --------------------------------------------------------
con <- cumulus$pg_con()

tbl_polys <- tbl(con,"polygon") |> 
  filter(adm_level == 1)

df_lookup <- cumulus$blob_load_admin_lookup() |> 
  clean_names()

df_polys <- tbl_polys |> 
  filter(
    iso3 %in% c("GTM")
  ) |> 
  collect() |> 
  clean_names()

df_poly_meta <- df_polys |> 
  left_join(
    df_lookup |> 
      select(iso3,matches("adm\\d"),matches("adm_level")) |> 
      filter(adm_level ==1),
    by = c("iso3" = "iso3","pcode"="adm1_pcode","adm_level")
  )

df_new_aoi_meta <- df_poly_meta |> 
  filter( 
    pcode %in% df_aoi$pcode
    )

Code

# the below doesn't need to be evaluated on render - as I save the outputs of this 
# chunk to blob and load later. Nonetheless, we can keep the code in the notebook.

lgdf_nic <- cumulus$download_fieldmaps_sf(iso3="nic",layer =c("nic_adm1","nic_adm0"))
lgdf_hnd <- cumulus$download_fieldmaps_sf(iso3="hnd",layer=c("hnd_adm1","hnd_adm0"))
lgdf_slv <- cumulus$download_fieldmaps_sf(iso3="slv",layer=c("slv_adm1","slv_adm0"))
lgdf_gtm <- cumulus$download_fieldmaps_sf(iso3="gtm",layer=c("gtm_adm1","gtm_adm0"))


gdf_adm1_cadc <- list_rbind(
  list(
    lgdf_nic$nic_adm1,
    lgdf_hnd$hnd_adm1,
    lgdf_slv$slv_adm1,
    lgdf_gtm$gtm_adm1
  )
) |> 
  rename(
    geometry=geom
  )

gdf_adm0_cadc <- list_rbind(
  list(
    lgdf_nic$nic_adm0,
    lgdf_hnd$hnd_adm0,
    lgdf_slv$slv_adm0,
    lgdf_gtm$gtm_adm0
  )
) |> 
  rename(
    geometry=geom
  )


cumulus$blob_write(
  gdf_adm1_cadc,
  container = "projects",
  name =  "ds-aa-lac-dry-corridor/framework_update_2025/gdf_cadc_adm1.parquet"
  )
cumulus$blob_write(
  gdf_adm0_cadc,
  container = "projects",
  name =  "ds-aa-lac-dry-corridor/framework_update_2025/gdf_cadc_adm0.parquet"
  )

Code

gdf_aoi <- gdf_adm1_cadc |> 
  filter(
    ADM1_PCODE %in% df_new_aoi_meta$pcode
  )
gdf_adm0_gtm <- gdf_adm0_cadc |> 
  filter(ADM0_ES == "Guatemala")


ggplot()+
  geom_sf(data =gdf_adm0_gtm,aes(geometry= geometry), fill ="white")+
  geom_sf(data= gdf_aoi,aes(geometry=geometry), fill = "red")+
  coord_sf(xlim= st_bbox(gdf_adm0_gtm)[c(1,3)],ylim= st_bbox(gdf_adm0_gtm)[c(2,4)], expand = FALSE) +
  theme_void()+
  theme(
    panel.background = element_rect(fill = "darkgrey"),
    panel.grid = element_blank(),
    plot.background = element_rect(fill = "darkgrey")
  )

Aggreate/Analyze Forecast

Code

# First we just look at ECMWF - here we actually consider all leadtimes -- not just those considered for monitoring and we see no activation present in historical record. If we did see activation we would want to check on what lead times those occured.

# quick func to filter historical data based on conditions of framework
filter_historical_to_moments <-  function(df,window_col){
  df |> 
    filter(
      !(month(issued_date)==2 & !!sym(window_col)=="primera"),
      !(month(issued_date) %in% c(5,9) & !!sym(window_col)=="postrera")
    )
    
}
# Prep Seas5 --------------------------------------------------------------

df_seas5 <- tbl(con,"seas5") |> 
  filter(
    adm_level == 1,
    pcode %in% df_aoi$pcode,
    month(valid_date)%in% c(5:11) # just grabbing all relevant months (including postrera) for both windows
  ) |> 
  collect() # this actually loads data into memory so can take a 10s or so.

df_seas5 <- df_seas5 |> 
  mutate(
    precipitation = days_in_month(valid_date) * mean
  )
  

df_primera <- cumulus$seas5_aggregate_forecast(
    df_seas5,
    value = "precipitation",
  valid_months =c(5:8),
  by = c("iso3", "pcode","issued_date")
) |> 
  mutate(
    window = "primera"
  )

df_postrera <- cumulus$seas5_aggregate_forecast(
  df_seas5,
  valid_months =c(9:11),
  value = "precipitation",
  by = c("iso3", "pcode","issued_date")
  ) |> 
  mutate(
    window = "postrera"
  )
df_historical <- bind_rows(
  df_primera,
  df_postrera
)

df_historical_aa <- df_historical |> 
  filter_historical_to_moments(
    window_col ="window"
    )

# startNetwork is monitoring adm1s independent - so no weighted avg

df_historical_aa_adm0 <- df_historical_aa  |> 
  left_join(
    df_new_aoi_meta |> 
      select(
        pcode, iso3, adm1_es = name
      )
    )

INSIVUMEH

Here we look at the historical forecasts from insivumeh. Years that are highlighted indicate that an activation would have occured in either March or April.

Code

df_insivumeh_thresholds <- df_combined_thresholds |> 
  filter(forecast_source == "INSIVUMEH")


df_insivumeh_aa <- df_insivumeh_zonal |> 
  mutate(
    season = "primera"
  ) |> 
  # already done in parquet file -- but not bad to leav here
  filter_historical_to_moments(window_col = "season") |> 
  # rename to use same plotting code above
  rename(
    window = "season"
  )

df_insivumeh_zonal_min_per_year <- df_insivumeh_aa |> 
  left_join(
    df_insivumeh_thresholds |> 
      filter(window == "primera"),
    by = c("adm0_es", "adm1_es","adm1_pcode","leadtime","window")
    ) |> 
  mutate(
    yr_issued = year(issued_date),
    mm_flag = value<=value_empirical,
    priority = if_else(mm_flag, 1, 2) # trick for plotting
  ) |> 
  group_by(adm0_es,adm1_es,window,yr_issued) |> 
  slice_min(
    order_by = tibble(priority,value)
  )

df_insivumeh_zonal_min_per_year |> 
  ggplot(
    aes(x= year(issued_date),y=value, group = 1)
  )+
  geom_point(aes(color = mm_flag), alpha= 0.7, size= 4)+
  geom_line(color = "black")+
  facet_wrap(~adm1_es, ncol=1)+
  scale_color_manual(values = c(hdx_hex("sapphire-hdx"),hdx_hex("tomato-hdx")))+
  labs(
      title = "CADC Guatemala: Primera Season (MJJA) Forecast",
    subtitle = "INSIVUMEH March + April Publication",
    caption = "Points represent minimum rainfall across both leadtimes monitoried in"
  ) +
  geom_text_repel(
    data= filter(df_insivumeh_zonal_min_per_year,mm_flag),
    aes(
      label = year(issued_date)), color = hdx_hex("tomato-hdx"),size= txt_label_size
  )+
  theme(
    axis.title = element_blank(),
    legend.position = "none"
  )

ECMWF

To capture the May activation moment we use ECMWF global seasonal forecast.

Code

df_seas5_thresholds <- df_combined_thresholds |> 
  filter(
    forecast_source =="SEAS5"
  )

df_classified_2024_seas5_thresholds <- df_historical_aa_adm0 |> 
  left_join(df_seas5_thresholds, by = c("iso3","pcode","adm1_es"="name","leadtime","window")) |> 
  mutate(
    precipitation_flag = precipitation <= value_empirical
  )


df_classified_2024_seas5_thresholds |> 
  filter(
    window == "primera",
    leadtime ==0
  ) |> 
  ggplot(
    aes(x= year(issued_date),y=precipitation, group = 1)
  )+
  geom_point(aes(color = precipitation_flag), alpha= 0.7, size= 4)+
  geom_line(color = "black")+
  facet_wrap(~adm1_es, ncol = 1)+
  scale_color_manual(values = c(hdx_hex("sapphire-hdx"),hdx_hex("tomato-hdx")))+
  labs(
    title = "CADC Guatemala: Primera Season (MJJA) Forecast",
    subtitle = "Global ECMWF SEAS5 May Publication",
    caption = "Plotting minimum rainfall at any leadtime monitoried in AA Framework
          Analysis performed at sub-national levelbased on area-weighted average of admins of interest per country"
  ) +
  geom_text_repel(
    data= filter(df_classified_2024_seas5_thresholds,precipitation_flag, window =="primera", leadtime ==0),
    aes(label = year(issued_date)), color = hdx_hex("tomato-hdx"), size=txt_label_size
  )+
  theme(
    axis.title = element_blank(),
    legend.position = "none"
  )

Threshold Tables

Code

df_thresholds_aa <- df_combined_thresholds |> 
  filter(
    # do not use SEAS5 for Guatemala -- only for leadtime 0
      !(iso3 == "GTM" & leadtime %in% c(1:4) & forecast_source == "SEAS5"),
      
      # these 2 months not being moniored
      !(issued_month_label %in% c("Feb","Sep")),
      !(issued_month_label == "May" & window =="postrera")
  ) |> 
    mutate(
    # pull over from seas5 cols
    adm1_es =if_else(is.na(adm1_es),name, adm1_es),
    adm1_pcode =if_else(is.na(adm1_pcode),pcode, adm1_pcode)
  ) 
  

df_thresholds_formatted <- df_thresholds_aa %>% 
  mutate(
    # pull over from seas5 cols
    adm1_es =if_else(is.na(adm1_es),name, adm1_es),
    adm1_pcode =if_else(is.na(adm1_pcode),pcode, adm1_pcode)
  ) |> 
  arrange(
    iso3, window
  ) |> 
  # print(n=15) |> 
  mutate(
    pub_month_int = ifelse(window == "primera",5- leadtime,9-leadtime),
    pub_month_chr = month(pub_month_int, label =T, abb=T),
    pub_month_lt = paste0(pub_month_chr," (", leadtime,")"),
  ) %>% 
  select(
    adm0_es, 
    adm1_es,
    pub_month_lt,
    threshold = value_empirical,
    window,
    forecast_source
  ) 

ldf_thresholds_wide <- split(df_thresholds_formatted,df_thresholds_formatted$window) %>% 
  map(\(dft){
    dft |> 
    arrange(adm0_es) %>% 
        select(-forecast_source,-window) |> 
        filter(
          !str_detect(pub_month_lt, "^Feb")
        ) %>% 
        pivot_wider(names_from = adm1_es,
                    values_from =threshold
        ) |> 
        select(
          pub_month_lt,
          "Baja Verapaz",
          "Quiché" 
        ) %>%
        arrange(
          desc(parse_number(pub_month_lt))
        )
    
  })
        

txt_footnote <- "Threshold calculations are made based on analysis of historical forecast data (1981-2022) to approximate a 1 in 4 year return period drought levels for rainfall over the entire season for each administrative 1 unit respectively. Thresholds are calculated per country, leadtime,admin 1 unit, and forecast data source to minimize potential biases. Where possible national forecasts were used for this analysis and monitoring. Where no national forecasts were readily available, ECMWF seasonal forecasts/historical forecasts were used. <br><br><b> Note:</b> The national forecast provided by INSIVUMEH in Guatemala does not provide a forecast estimate for the month of publication, therefore when that month is included as an activation moment, ECMWF is used."


# set colors for table/legend
# ecmwf_cols <- c("El Salvador","Honduras","Nicaragua")
ecmwf_color <- hdx_hex("mint-light")
insuv_color <- hdx_hex("sapphire-light")


# make legend for table MARS data
gt_legend <- data.frame(
  `Forecast Data Source` = "Forecast Data Source",
  ecmwf = "ECMWF SEAS5" ,
  insuv= "INSIVUMEH"
  
) %>% 
  gt(rowname_col = "Forecast Data Source") %>%  
  data_color(columns = "ecmwf", direction = "column",color=ecmwf_color) %>% 
  data_color(columns = "insuv", direction = "column",color=insuv_color) %>% 
  tab_options(
    column_labels.hidden = T
  ) %>% 
  as_raw_html()

Code

ldf_thresholds_wide$primera %>% 
  gt() %>% 
  fmt_number(decimals = 0) %>% 
  cols_label(
    pub_month_lt = html("Publication Month<br>(leadtime)")
  ) %>% 
  # data_color(columns = ecmwf_cols, direction = "column",color=ecmwf_color) %>% 
  tab_style(
    style = cell_fill(color = insuv_color),
    locations = cells_body(
      columns = c("Baja Verapaz","Quiché"), 
      rows = 1:2
    )) %>% 
  tab_style(
    style = cell_fill(color = ecmwf_color),
    locations = cells_body(
      columns = c("Baja Verapaz","Quiché"), 
      rows = 3
    )) %>% 
  tab_footnote(footnote = html(txt_footnote)) %>% 
  tab_header(title = "Primera (MJJA 2025) Rainfall (mm) Monitoring Thresholds",
             subtitle = gt_legend
  )

Forecast Data Source	ECMWF SEAS5	INSIVUMEH

Mar (2)

755

1,039

Apr (1)

721

1,018

May (0)

803

905

Threshold calculations are made based on analysis of historical forecast data (1981-2022) to approximate a 1 in 4 year return period drought levels for rainfall over the entire season for each administrative 1 unit respectively. Thresholds are calculated per country, leadtime,admin 1 unit, and forecast data source to minimize potential biases. Where possible national forecasts were used for this analysis and monitoring. Where no national forecasts were readily available, ECMWF seasonal forecasts/historical forecasts were used.

Note: The national forecast provided by INSIVUMEH in Guatemala does not provide a forecast estimate for the month of publication, therefore when that month is included as an activation moment, ECMWF is used.

Joint RPs

StartNetwork has expressed that we should implement a 4 year return period based threshold for each administrative unite and lead time separately. Therefore, it is also useful to look at the overall joint return period/activation rate for an activation occurring in any of the lead times and either admin1 area.

Code

# bind together all 
df_all_forecasts <- bind_rows(
  df_insivumeh_aa |> 
    mutate(
      forecast_source = "INSIVUMEH",
      iso3 = "GTM"
    ),
  df_historical_aa_adm0 |> 
    mutate(
      forecast_source = "SEAS5"
    ) |> 
    rename(
      adm1_pcode = "pcode",
      value ="precipitation"
    )
) |> 
  select(-adm0_es)



# classify all forecasts as above or below threshold
df_all_historical_forecasts_classified <- 
  df_all_forecasts |> 
  # select(-adm1_es) |> 
  left_join(
    df_thresholds_aa |> 
      select(
        iso3,
        adm0_es,
        adm1_es, 
        adm1_pcode,
        leadtime, 
        issued_month_label, 
        RP_empirical,
        value_empirical, 
        forecast_source, window)
    # df_combined_thresholds |> 
    #   mutate(
    #         adm1_es =if_else(is.na(adm1_es),name, adm1_es),
    # adm1_pcode =if_else(is.na(adm1_pcode),pcode, adm1_pcode)
    #   )
  ) |> 
    mutate(
    flag = value<= value_empirical
  )

Code

# now filter to only relevant  forecast sources for monitoring

df_historical_forecasts_classified_filtered <- df_all_historical_forecasts_classified |> 
  filter(
    year(issued_date) %in% 1981:2024,
    !(iso3 =="GTM" & leadtime>0 & forecast_source=="SEAS5")
  ) 

# get activation rates per window over all leadtimes
df_jrp_adm <- df_historical_forecasts_classified_filtered |> 
  arrange(
    adm1_pcode,adm1_es, issued_date, leadtime
  ) |> 
  group_by(iso3,adm0_es, yr_issued_date =year(issued_date),window) |> 
  summarise(
    flag = any(flag),
    adm = glue_collapse(unique(adm1_es),sep = "/")
  ) |> 
  group_by(adm0_es,window,adm) |>
  
  summarise(
    # flag = any(flag),
    n = n(),
    nevents = sum(flag),
    years_triggered = glue_collapse(unique(yr_issued_date[flag]),", "),
    ar_weibull = nevents / (n + 1),
    rp_weibull = 1 / ar_weibull,
    ar = mean(flag),
    rp = 1/ar,
    ,
    .groups="drop"
  )

Code

 df_jrp_adm |> 
  gt() |> 
  fmt_percent(starts_with("ar"),decimals = 0) |> 
  fmt_number(starts_with("rp"), decimals=1) |> 
     cols_merge(
    columns = c(`rp_weibull`, `ar_weibull`),
    pattern = "{1} ({2})"
  ) |> 
  cols_label(
    "rp_weibull"= "RP (activation rate)",
     "years_triggered" = "Years triggered",
    "adm0_es"= "Country",
    "adm"= "Province",
    
    ) |> 
  cols_hide(
    c("window","n","nevents","ar","rp")
  ) |> 
  cols_width(
    years_triggered ~ px(200)
  ) |> 
  tab_header(
    title=  "Joint Return Periods/Activation Rate (selected admin 1 AOI)",
    subtitle = "Across all monitored leadtimes and provinces"
  ) |> tab_footnote(footnote = "Analysis years: 1981-2024") |> 
  tab_options(
    heading.background.color = "#00ad78ff",table.font.size = 14,
    column_labels.background.color = hdx_hex("mint-ultra-light"),
  )

Joint Return Periods/Activation Rate (selected admin 1 AOI)
Across all monitored leadtimes and provinces
Country	Province	Years triggered	RP (activation rate)
Guatemala	Quiché/Baja Verapaz	1982, 1985, 1986, 1989, 1991, 1992, 1994, 2002, 2003, 2004, 2009, 2012, 2014, 2015, 2016, 2017, 2018, 2019, 2021	2.4 (42%)
Analysis years: 1981-2024