Load Data and Packages

CBG-fire and estab-fire data

cbg_fire <- read_dta("baseline_sample_cells.dta") %>%
  mutate(
    cbg = str_pad(as.character(geoid), 12, pad = "0"),
    Fire_ID        = factor(fire_id)) %>%
  select(-c(geoid, fire_id, archive_version_year))

## check if a cbg has only 1 treat value thruought time
# distinct_counts <- cbg_fire %>%
#   group_by(Fire_ID, cbg) %>%
#   summarise(n_treat_levels = n_distinct(treat), .groups = "drop")
    
panel_regression_alarm <- read_csv("estab_fire_month_alarm_panel.csv") %>%
  select(c("placekey","Fire_ID","m_from_alarm","alarm_date","end_date","poi_cbg","ym","visits_month","visitors_month", "POST_alarm","month_idx", "band")) %>%
  mutate(
    cbg = str_pad(as.character(poi_cbg), 12, pad = "0"),
    Fire_ID        = factor(Fire_ID),
    month_idx   = as.integer(factor(month_idx, levels = sort(unique(month_idx))))
  )%>%
  select(-poi_cbg)

## Rows: 1495195 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): placekey, locatoion_name, street_address, city, state, band, geom
## dbl  (59): naics_code, postal_code, Fire_ID, dist_to_fire_km, poi_cbg, visit...
## date  (5): alarm_date, end_date, ym, alarm_m, ym_fac
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

panel_regression_cont <- read_csv("estab_fire_month_cont_panel.csv") %>%
  select(c("placekey","Fire_ID","m_from_contain","alarm_date","end_date","poi_cbg","ym","visits_month","visitors_month", "POST_cont","month_idx", "band")) %>%
  mutate(
    cbg = str_pad(as.character(poi_cbg), 12, pad = "0"),
    Fire_ID        = factor(Fire_ID),
    month_idx   = as.integer(factor(month_idx, levels = sort(unique(month_idx))))
  )%>%
  select(-poi_cbg)

## Rows: 1439908 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): placekey, locatoion_name, street_address, city, state, band, geom
## dbl  (60): Fire_ID, poi_cbg, visits_month, visitors_month, m_from_contain, n...
## date  (6): ym, contain_m, ym_fac, alarm_date, end_date, alarm_m
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

join cbg-fire to estab-fire

1) classify groups and get a cbg-level treat when it’s uniform

cbg_status <- cbg_fire %>%
  group_by(Fire_ID, cbg) %>%
  summarise(
    has0 = any(treat == 0, na.rm = TRUE),
    has1 = any(treat == 1, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    partial   = has0 & has1,
    cbg_treat = case_when(has1 & !has0 ~ 1L,
                          has0 & !has1 ~ 0L,
                          TRUE ~ NA_integer_) 
  ) %>%
  select(Fire_ID, cbg, partial, cbg_treat)

2) attach to estab and decide row-level treat to use

panel_alarm <- panel_regression_alarm %>%
  left_join(cbg_status, by = c("Fire_ID","cbg")) %>%
  mutate(
    treat_pick = if_else(partial,
                         if_else(band == "inside", 1L, 0L),
                         cbg_treat)
  )

# 3) Bring columns from cbg_fire for the chosen treat
#    If cbg_fire has multiple rows per (Fire_ID, cbg, treat), pick one (e.g., first or min km).
cbg_fire_uniq <- cbg_fire %>%
  arrange(Fire_ID, cbg, treat) %>%
  distinct(Fire_ID, cbg, treat, .keep_all = TRUE)

cbg_alarm <- panel_alarm %>%
  left_join(cbg_fire_uniq, by = c("Fire_ID","cbg","treat_pick" = "treat"))

print(cbg_alarm %>%
        filter(is.na(treat_pick)))

## # A tibble: 228,467 × 16
##    placekey   Fire_ID m_from_alarm alarm_date end_date   ym         visits_month
##    <chr>      <fct>          <dbl> <date>     <date>     <date>            <dbl>
##  1 22c-222@5… 20850             -9 2019-10-31 2019-11-05 2019-01-01           10
##  2 22c-222@5… 20850             -8 2019-10-31 2019-11-05 2019-02-01           13
##  3 22c-222@5… 20850             -7 2019-10-31 2019-11-05 2019-03-01           15
##  4 22c-222@5… 20850             -6 2019-10-31 2019-11-05 2019-04-01           22
##  5 22c-222@5… 20850             -5 2019-10-31 2019-11-05 2019-05-01           29
##  6 22c-222@5… 20850             -4 2019-10-31 2019-11-05 2019-06-01           16
##  7 22c-222@5… 20850             -3 2019-10-31 2019-11-05 2019-07-01           28
##  8 22c-222@5… 20850             -2 2019-10-31 2019-11-05 2019-08-01           33
##  9 22c-222@5… 20850             -1 2019-10-31 2019-11-05 2019-09-01           39
## 10 22c-222@5… 20850              0 2019-10-31 2019-11-05 2019-10-01           35
## # ℹ 228,457 more rows
## # ℹ 9 more variables: visitors_month <dbl>, POST_alarm <dbl>, month_idx <int>,
## #   band <chr>, cbg <chr>, partial <lgl>, cbg_treat <int>, treat_pick <dbl>,
## #   km_to_fire <dbl>

There are estab with NA CBG level info.

NA Fields diagnostic

— 1) Diagnose why rows won’t match (this is why see NAs) ————-

unmatched <- anti_join(panel_regression_alarm, cbg_status, by = c("Fire_ID","cbg"))
nrow(unmatched)           # how many will become NA?

## [1] 228467

head(unmatched, 10)       # inspect a few: wrong cbg length? different Fire_ID?

## # A tibble: 10 × 12
##    placekey   Fire_ID m_from_alarm alarm_date end_date   ym         visits_month
##    <chr>      <fct>          <dbl> <date>     <date>     <date>            <dbl>
##  1 22c-222@5… 20850             -9 2019-10-31 2019-11-05 2019-01-01           10
##  2 22c-222@5… 20850             -8 2019-10-31 2019-11-05 2019-02-01           13
##  3 22c-222@5… 20850             -7 2019-10-31 2019-11-05 2019-03-01           15
##  4 22c-222@5… 20850             -6 2019-10-31 2019-11-05 2019-04-01           22
##  5 22c-222@5… 20850             -5 2019-10-31 2019-11-05 2019-05-01           29
##  6 22c-222@5… 20850             -4 2019-10-31 2019-11-05 2019-06-01           16
##  7 22c-222@5… 20850             -3 2019-10-31 2019-11-05 2019-07-01           28
##  8 22c-222@5… 20850             -2 2019-10-31 2019-11-05 2019-08-01           33
##  9 22c-222@5… 20850             -1 2019-10-31 2019-11-05 2019-09-01           39
## 10 22c-222@5… 20850              0 2019-10-31 2019-11-05 2019-10-01           35
## # ℹ 5 more variables: visitors_month <dbl>, POST_alarm <dbl>, month_idx <int>,
## #   band <chr>, cbg <chr>

2) simple breakdown: missing keys vs pair-not-found

pairs_cbg_fire <- cbg_fire %>% distinct(Fire_ID, cbg)

# rows in estab that won't match the (Fire_ID, cbg) pairs in cbg_fire
unmatched <- panel_regression_alarm %>% anti_join(pairs_cbg_fire, by = c("Fire_ID","cbg"))

unmatched_simple <- unmatched %>%
  mutate(reason = case_when(
    is.na(Fire_ID) & is.na(cbg) ~ "missing_both",
    is.na(Fire_ID)              ~ "missing_Fire_ID",
    is.na(cbg)                  ~ "missing_cbg",
    TRUE                        ~ "(Fire_ID, cbg) not in cbg_fire"
  )) %>%
  count(reason, name = "n") %>%
  arrange(desc(n))

unmatched_simple

## # A tibble: 1 × 2
##   reason                              n
##   <chr>                           <int>
## 1 (Fire_ID, cbg) not in cbg_fire 228467

3) deep dive: missing keys vs pair-not-found

fire_set <- unique(pairs_cbg_fire$Fire_ID)
cbg_set  <- unique(pairs_cbg_fire$cbg)

unmatched_detailed <- unmatched %>%
  filter(!is.na(Fire_ID), !is.na(cbg)) %>%
  mutate(
    Fire_ID_in_cbg_fire = Fire_ID %in% fire_set,
    cbg_in_cbg_fire     = cbg %in% cbg_set,
    detail = case_when(
      Fire_ID_in_cbg_fire & cbg_in_cbg_fire ~ "both exist separately, pair missing",
      Fire_ID_in_cbg_fire & !cbg_in_cbg_fire ~ "Fire_ID exists, cbg absent globally",
      !Fire_ID_in_cbg_fire & cbg_in_cbg_fire ~ "cbg exists, Fire_ID absent globally",
      TRUE ~ "neither key exists globally"
    )
  ) %>%
  count(detail, name = "n") %>%
  arrange(desc(n))

unmatched_detailed

## # A tibble: 4 × 2
##   detail                                   n
##   <chr>                                <int>
## 1 cbg exists, Fire_ID absent globally 160164
## 2 both exist separately, pair missing  48444
## 3 neither key exists globally          14525
## 4 Fire_ID exists, cbg absent globally   5334

4) Missing keys from CBG-fire (Daniel’s) by category and what they are

# Distinct key set from cbg_fire
pairs_cbg_fire <- cbg_fire %>% distinct(Fire_ID, cbg)
fire_set <- unique(pairs_cbg_fire$Fire_ID)
cbg_set  <- unique(pairs_cbg_fire$cbg)

# Rows in estab that don't match any (Fire_ID, cbg) pair in cbg_fire
unmatched <- panel_regression_alarm %>% anti_join(pairs_cbg_fire, by = c("Fire_ID","cbg"))
unmatched_nonmissing <- unmatched %>% filter(!is.na(Fire_ID), !is.na(cbg))

# 1) Distinct Fire_IDs absent globally (and what they are)
absent_fire_ids <- unmatched %>%
  filter(!is.na(Fire_ID), !(Fire_ID %in% fire_set)) %>%
  distinct(Fire_ID) %>%
  arrange(Fire_ID)
n_absent_fire_ids <- nrow(absent_fire_ids)

# 2) Distinct CBGs absent globally (and what they are)
absent_cbgs <- unmatched %>%
  filter(!is.na(cbg), !(cbg %in% cbg_set)) %>%
  distinct(cbg) %>%
  arrange(cbg)
n_absent_cbgs <- nrow(absent_cbgs)

# 3) Distinct pairs missing (both keys exist separately but the pair is not present)
pairs_missing <- unmatched_nonmissing %>%
  filter(Fire_ID %in% fire_set, cbg %in% cbg_set) %>%
  distinct(Fire_ID, cbg) %>%
  arrange(Fire_ID, cbg)
n_pairs_missing <- nrow(pairs_missing)

# 4) “Neither exist globally” category — distinct Fire_IDs and CBGs (and what they are)
neither_exist <- unmatched_nonmissing %>%
  filter(!(Fire_ID %in% fire_set) & !(cbg %in% cbg_set))

neither_fire_ids <- neither_exist %>% distinct(Fire_ID) %>% arrange(Fire_ID)
n_neither_fire_ids <- nrow(neither_fire_ids)

neither_cbgs <- neither_exist %>% distinct(cbg) %>% arrange(cbg)
n_neither_cbgs <- nrow(neither_cbgs)

# --- Summary objects to view/print ---
list(
  n_absent_fire_ids = n_absent_fire_ids,
  absent_fire_ids = absent_fire_ids,
  n_absent_cbgs = n_absent_cbgs,
  absent_cbgs = absent_cbgs,
  n_pairs_missing = n_pairs_missing,
  pairs_missing = pairs_missing,
  n_neither_fire_ids = n_neither_fire_ids,
  neither_fire_ids = neither_fire_ids,
  n_neither_cbgs = n_neither_cbgs,
  neither_cbgs = neither_cbgs
)

## $n_absent_fire_ids
## [1] 4
## 
## $absent_fire_ids
## # A tibble: 4 × 1
##   Fire_ID
##   <fct>  
## 1 183    
## 2 21017  
## 3 21150  
## 4 21341  
## 
## $n_absent_cbgs
## [1] 46
## 
## $absent_cbgs
## # A tibble: 46 × 1
##    cbg         
##    <chr>       
##  1 060133131031
##  2 060133551123
##  3 060170306022
##  4 060170306024
##  5 060170306031
##  6 060350403031
##  7 060350403051
##  8 060374303012
##  9 060450101002
## 10 060450101003
## # ℹ 36 more rows
## 
## $n_pairs_missing
## [1] 193
## 
## $pairs_missing
## # A tibble: 193 × 2
##    Fire_ID cbg         
##    <fct>   <chr>       
##  1 14      060570003002
##  2 14      060570004021
##  3 14      060570004024
##  4 14      060570007022
##  5 14      060570009002
##  6 40      060590219151
##  7 40      060590219152
##  8 40      060590219211
##  9 40      060590219241
## 10 40      060590320111
## # ℹ 183 more rows
## 
## $n_neither_fire_ids
## [1] 2
## 
## $neither_fire_ids
## # A tibble: 2 × 1
##   Fire_ID
##   <fct>  
## 1 183    
## 2 21017  
## 
## $n_neither_cbgs
## [1] 26
## 
## $neither_cbgs
## # A tibble: 26 × 1
##    cbg         
##    <chr>       
##  1 060450108011
##  2 060650406072
##  3 060650406151
##  4 060650406152
##  5 060650406162
##  6 060650407022
##  7 060650407031
##  8 060650407032
##  9 060650408131
## 10 060650408141
## # ℹ 16 more rows

5) Missing keys from estab-yearmonth-fire (Mine) by category and what they are

# --- distinct key sets ------------------------------------------------------
pairs_cbg_fire  <- cbg_fire %>% distinct(Fire_ID, cbg)
pairs_estab     <- panel_regression_alarm %>% distinct(Fire_ID, cbg)

estab_fire_set  <- unique(pairs_estab$Fire_ID)
estab_cbg_set   <- unique(pairs_estab$cbg)

# Pairs that are in cbg_fire but NOT in estab_yearmonth_fire
unmatched_from_cbgfire <- anti_join(pairs_cbg_fire, pairs_estab, by = c("Fire_ID","cbg"))

# --- categories & outputs ---------------------------------------------------
# 1) Fire_IDs in cbg_fire absent globally from estab
absent_fire_ids_in_estab <- unmatched_from_cbgfire %>%
  filter(!(Fire_ID %in% estab_fire_set)) %>%
  distinct(Fire_ID) %>% arrange(Fire_ID)

# 2) CBGs in cbg_fire absent globally from estab
absent_cbgs_in_estab <- unmatched_from_cbgfire %>%
  filter(!(cbg %in% estab_cbg_set)) %>%
  distinct(cbg) %>% arrange(cbg)

# 3) Pairs missing where both keys exist separately in estab
pairs_missing_in_estab <- unmatched_from_cbgfire %>%
  filter(Fire_ID %in% estab_fire_set, cbg %in% estab_cbg_set) %>%
  distinct(Fire_ID, cbg) %>% arrange(Fire_ID, cbg)

# 4) “Neither key exists” in estab (for these cbg_fire pairs)
neither_exist_in_estab <- unmatched_from_cbgfire %>%
  filter(!(Fire_ID %in% estab_fire_set) & !(cbg %in% estab_cbg_set))

neither_fire_ids_in_estab <- neither_exist_in_estab %>%
  distinct(Fire_ID) %>% arrange(Fire_ID)

neither_cbgs_in_estab <- neither_exist_in_estab %>%
  distinct(cbg) %>% arrange(cbg)

# --- compact summary you can print/view -------------------------------------
summary_checks <- list(
  n_absent_fire_ids_in_estab = nrow(absent_fire_ids_in_estab),
  absent_fire_ids_in_estab   = absent_fire_ids_in_estab,
  n_absent_cbgs_in_estab     = nrow(absent_cbgs_in_estab),
  absent_cbgs_in_estab       = absent_cbgs_in_estab,
  n_pairs_missing_in_estab   = nrow(pairs_missing_in_estab),
  pairs_missing_in_estab     = pairs_missing_in_estab,
  n_neither_fire_ids_in_estab= nrow(neither_fire_ids_in_estab),
  neither_fire_ids_in_estab  = neither_fire_ids_in_estab,
  n_neither_cbgs_in_estab    = nrow(neither_cbgs_in_estab),
  neither_cbgs_in_estab      = neither_cbgs_in_estab
)

summary_checks

## $n_absent_fire_ids_in_estab
## [1] 382
## 
## $absent_fire_ids_in_estab
## # A tibble: 382 × 1
##    Fire_ID
##    <fct>  
##  1 10     
##  2 17     
##  3 75     
##  4 84     
##  5 177    
##  6 189    
##  7 211    
##  8 218    
##  9 222    
## 10 232    
## # ℹ 372 more rows
## 
## $n_absent_cbgs_in_estab
## [1] 8195
## 
## $absent_cbgs_in_estab
## # A tibble: 8,195 × 1
##    cbg         
##    <chr>       
##  1 060014351031
##  2 060014351033
##  3 060014351041
##  4 060014380001
##  5 060014380002
##  6 060014381002
##  7 060014381003
##  8 060014381004
##  9 060014381005
## 10 060014381006
## # ℹ 8,185 more rows
## 
## $n_pairs_missing_in_estab
## [1] 2
## 
## $pairs_missing_in_estab
## # A tibble: 2 × 2
##   Fire_ID cbg         
##   <fct>   <chr>       
## 1 41      060650481001
## 2 20850   061110076122
## 
## $n_neither_fire_ids_in_estab
## [1] 344
## 
## $neither_fire_ids_in_estab
## # A tibble: 344 × 1
##    Fire_ID
##    <fct>  
##  1 10     
##  2 17     
##  3 75     
##  4 84     
##  5 189    
##  6 211    
##  7 218    
##  8 222    
##  9 232    
## 10 281    
## # ℹ 334 more rows
## 
## $n_neither_cbgs_in_estab
## [1] 7694
## 
## $neither_cbgs_in_estab
## # A tibble: 7,694 × 1
##    cbg         
##    <chr>       
##  1 060014511011
##  2 060014511012
##  3 060014511013
##  4 060014511014
##  5 060014512011
##  6 060014512022
##  7 060014512023
##  8 060014515041
##  9 060014515051
## 10 060014515052
## # ℹ 7,684 more rows

estab summary

n_pairs <- panel_regression_alarm %>%
  summarise(
    n_fire = n_distinct(Fire_ID, na.rm = TRUE),
    n_cbg  = n_distinct(cbg,     na.rm = TRUE)
  )
n_pairs

## # A tibble: 1 × 2
##   n_fire n_cbg
##    <int> <int>
## 1     38  2343

Data Preprocessor

Qianyi Wang (71707075)

2025-09-02

Load Data and Packages

CBG-fire and estab-fire data

join cbg-fire to estab-fire

1) classify groups and get a cbg-level treat when it’s uniform

2) attach to estab and decide row-level treat to use

NA Fields diagnostic

— 1) Diagnose why rows won’t match (this is why see NAs) ————-

2) simple breakdown: missing keys vs pair-not-found

3) deep dive: missing keys vs pair-not-found

4) Missing keys from CBG-fire (Daniel’s) by category and what they are

5) Missing keys from estab-yearmonth-fire (Mine) by category and what they are

estab summary