cbg_fire <- read_dta("baseline_sample_cells.dta") %>%
mutate(
cbg = str_pad(as.character(geoid), 12, pad = "0"),
Fire_ID = factor(fire_id)) %>%
select(-c(geoid, fire_id, archive_version_year))
## check if a cbg has only 1 treat value thruought time
# distinct_counts <- cbg_fire %>%
# group_by(Fire_ID, cbg) %>%
# summarise(n_treat_levels = n_distinct(treat), .groups = "drop")
panel_regression_alarm <- read_csv("estab_fire_month_alarm_panel.csv") %>%
select(c("placekey","Fire_ID","m_from_alarm","alarm_date","end_date","poi_cbg","ym","visits_month","visitors_month", "POST_alarm","month_idx", "band")) %>%
mutate(
cbg = str_pad(as.character(poi_cbg), 12, pad = "0"),
Fire_ID = factor(Fire_ID),
month_idx = as.integer(factor(month_idx, levels = sort(unique(month_idx))))
)%>%
select(-poi_cbg)
## Rows: 1495195 Columns: 71
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): placekey, locatoion_name, street_address, city, state, band, geom
## dbl (59): naics_code, postal_code, Fire_ID, dist_to_fire_km, poi_cbg, visit...
## date (5): alarm_date, end_date, ym, alarm_m, ym_fac
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
panel_regression_cont <- read_csv("estab_fire_month_cont_panel.csv") %>%
select(c("placekey","Fire_ID","m_from_contain","alarm_date","end_date","poi_cbg","ym","visits_month","visitors_month", "POST_cont","month_idx", "band")) %>%
mutate(
cbg = str_pad(as.character(poi_cbg), 12, pad = "0"),
Fire_ID = factor(Fire_ID),
month_idx = as.integer(factor(month_idx, levels = sort(unique(month_idx))))
)%>%
select(-poi_cbg)
## Rows: 1439908 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): placekey, locatoion_name, street_address, city, state, band, geom
## dbl (60): Fire_ID, poi_cbg, visits_month, visitors_month, m_from_contain, n...
## date (6): ym, contain_m, ym_fac, alarm_date, end_date, alarm_m
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cbg_status <- cbg_fire %>%
group_by(Fire_ID, cbg) %>%
summarise(
has0 = any(treat == 0, na.rm = TRUE),
has1 = any(treat == 1, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
partial = has0 & has1,
cbg_treat = case_when(has1 & !has0 ~ 1L,
has0 & !has1 ~ 0L,
TRUE ~ NA_integer_)
) %>%
select(Fire_ID, cbg, partial, cbg_treat)
panel_alarm <- panel_regression_alarm %>%
left_join(cbg_status, by = c("Fire_ID","cbg")) %>%
mutate(
treat_pick = if_else(partial,
if_else(band == "inside", 1L, 0L),
cbg_treat)
)
# 3) Bring columns from cbg_fire for the chosen treat
# If cbg_fire has multiple rows per (Fire_ID, cbg, treat), pick one (e.g., first or min km).
cbg_fire_uniq <- cbg_fire %>%
arrange(Fire_ID, cbg, treat) %>%
distinct(Fire_ID, cbg, treat, .keep_all = TRUE)
cbg_alarm <- panel_alarm %>%
left_join(cbg_fire_uniq, by = c("Fire_ID","cbg","treat_pick" = "treat"))
print(cbg_alarm %>%
filter(is.na(treat_pick)))
## # A tibble: 228,467 × 16
## placekey Fire_ID m_from_alarm alarm_date end_date ym visits_month
## <chr> <fct> <dbl> <date> <date> <date> <dbl>
## 1 22c-222@5… 20850 -9 2019-10-31 2019-11-05 2019-01-01 10
## 2 22c-222@5… 20850 -8 2019-10-31 2019-11-05 2019-02-01 13
## 3 22c-222@5… 20850 -7 2019-10-31 2019-11-05 2019-03-01 15
## 4 22c-222@5… 20850 -6 2019-10-31 2019-11-05 2019-04-01 22
## 5 22c-222@5… 20850 -5 2019-10-31 2019-11-05 2019-05-01 29
## 6 22c-222@5… 20850 -4 2019-10-31 2019-11-05 2019-06-01 16
## 7 22c-222@5… 20850 -3 2019-10-31 2019-11-05 2019-07-01 28
## 8 22c-222@5… 20850 -2 2019-10-31 2019-11-05 2019-08-01 33
## 9 22c-222@5… 20850 -1 2019-10-31 2019-11-05 2019-09-01 39
## 10 22c-222@5… 20850 0 2019-10-31 2019-11-05 2019-10-01 35
## # ℹ 228,457 more rows
## # ℹ 9 more variables: visitors_month <dbl>, POST_alarm <dbl>, month_idx <int>,
## # band <chr>, cbg <chr>, partial <lgl>, cbg_treat <int>, treat_pick <dbl>,
## # km_to_fire <dbl>
There are estab with NA CBG level info.
unmatched <- anti_join(panel_regression_alarm, cbg_status, by = c("Fire_ID","cbg"))
nrow(unmatched) # how many will become NA?
## [1] 228467
head(unmatched, 10) # inspect a few: wrong cbg length? different Fire_ID?
## # A tibble: 10 × 12
## placekey Fire_ID m_from_alarm alarm_date end_date ym visits_month
## <chr> <fct> <dbl> <date> <date> <date> <dbl>
## 1 22c-222@5… 20850 -9 2019-10-31 2019-11-05 2019-01-01 10
## 2 22c-222@5… 20850 -8 2019-10-31 2019-11-05 2019-02-01 13
## 3 22c-222@5… 20850 -7 2019-10-31 2019-11-05 2019-03-01 15
## 4 22c-222@5… 20850 -6 2019-10-31 2019-11-05 2019-04-01 22
## 5 22c-222@5… 20850 -5 2019-10-31 2019-11-05 2019-05-01 29
## 6 22c-222@5… 20850 -4 2019-10-31 2019-11-05 2019-06-01 16
## 7 22c-222@5… 20850 -3 2019-10-31 2019-11-05 2019-07-01 28
## 8 22c-222@5… 20850 -2 2019-10-31 2019-11-05 2019-08-01 33
## 9 22c-222@5… 20850 -1 2019-10-31 2019-11-05 2019-09-01 39
## 10 22c-222@5… 20850 0 2019-10-31 2019-11-05 2019-10-01 35
## # ℹ 5 more variables: visitors_month <dbl>, POST_alarm <dbl>, month_idx <int>,
## # band <chr>, cbg <chr>
pairs_cbg_fire <- cbg_fire %>% distinct(Fire_ID, cbg)
# rows in estab that won't match the (Fire_ID, cbg) pairs in cbg_fire
unmatched <- panel_regression_alarm %>% anti_join(pairs_cbg_fire, by = c("Fire_ID","cbg"))
unmatched_simple <- unmatched %>%
mutate(reason = case_when(
is.na(Fire_ID) & is.na(cbg) ~ "missing_both",
is.na(Fire_ID) ~ "missing_Fire_ID",
is.na(cbg) ~ "missing_cbg",
TRUE ~ "(Fire_ID, cbg) not in cbg_fire"
)) %>%
count(reason, name = "n") %>%
arrange(desc(n))
unmatched_simple
## # A tibble: 1 × 2
## reason n
## <chr> <int>
## 1 (Fire_ID, cbg) not in cbg_fire 228467
fire_set <- unique(pairs_cbg_fire$Fire_ID)
cbg_set <- unique(pairs_cbg_fire$cbg)
unmatched_detailed <- unmatched %>%
filter(!is.na(Fire_ID), !is.na(cbg)) %>%
mutate(
Fire_ID_in_cbg_fire = Fire_ID %in% fire_set,
cbg_in_cbg_fire = cbg %in% cbg_set,
detail = case_when(
Fire_ID_in_cbg_fire & cbg_in_cbg_fire ~ "both exist separately, pair missing",
Fire_ID_in_cbg_fire & !cbg_in_cbg_fire ~ "Fire_ID exists, cbg absent globally",
!Fire_ID_in_cbg_fire & cbg_in_cbg_fire ~ "cbg exists, Fire_ID absent globally",
TRUE ~ "neither key exists globally"
)
) %>%
count(detail, name = "n") %>%
arrange(desc(n))
unmatched_detailed
## # A tibble: 4 × 2
## detail n
## <chr> <int>
## 1 cbg exists, Fire_ID absent globally 160164
## 2 both exist separately, pair missing 48444
## 3 neither key exists globally 14525
## 4 Fire_ID exists, cbg absent globally 5334
# Distinct key set from cbg_fire
pairs_cbg_fire <- cbg_fire %>% distinct(Fire_ID, cbg)
fire_set <- unique(pairs_cbg_fire$Fire_ID)
cbg_set <- unique(pairs_cbg_fire$cbg)
# Rows in estab that don't match any (Fire_ID, cbg) pair in cbg_fire
unmatched <- panel_regression_alarm %>% anti_join(pairs_cbg_fire, by = c("Fire_ID","cbg"))
unmatched_nonmissing <- unmatched %>% filter(!is.na(Fire_ID), !is.na(cbg))
# 1) Distinct Fire_IDs absent globally (and what they are)
absent_fire_ids <- unmatched %>%
filter(!is.na(Fire_ID), !(Fire_ID %in% fire_set)) %>%
distinct(Fire_ID) %>%
arrange(Fire_ID)
n_absent_fire_ids <- nrow(absent_fire_ids)
# 2) Distinct CBGs absent globally (and what they are)
absent_cbgs <- unmatched %>%
filter(!is.na(cbg), !(cbg %in% cbg_set)) %>%
distinct(cbg) %>%
arrange(cbg)
n_absent_cbgs <- nrow(absent_cbgs)
# 3) Distinct pairs missing (both keys exist separately but the pair is not present)
pairs_missing <- unmatched_nonmissing %>%
filter(Fire_ID %in% fire_set, cbg %in% cbg_set) %>%
distinct(Fire_ID, cbg) %>%
arrange(Fire_ID, cbg)
n_pairs_missing <- nrow(pairs_missing)
# 4) “Neither exist globally” category — distinct Fire_IDs and CBGs (and what they are)
neither_exist <- unmatched_nonmissing %>%
filter(!(Fire_ID %in% fire_set) & !(cbg %in% cbg_set))
neither_fire_ids <- neither_exist %>% distinct(Fire_ID) %>% arrange(Fire_ID)
n_neither_fire_ids <- nrow(neither_fire_ids)
neither_cbgs <- neither_exist %>% distinct(cbg) %>% arrange(cbg)
n_neither_cbgs <- nrow(neither_cbgs)
# --- Summary objects to view/print ---
list(
n_absent_fire_ids = n_absent_fire_ids,
absent_fire_ids = absent_fire_ids,
n_absent_cbgs = n_absent_cbgs,
absent_cbgs = absent_cbgs,
n_pairs_missing = n_pairs_missing,
pairs_missing = pairs_missing,
n_neither_fire_ids = n_neither_fire_ids,
neither_fire_ids = neither_fire_ids,
n_neither_cbgs = n_neither_cbgs,
neither_cbgs = neither_cbgs
)
## $n_absent_fire_ids
## [1] 4
##
## $absent_fire_ids
## # A tibble: 4 × 1
## Fire_ID
## <fct>
## 1 183
## 2 21017
## 3 21150
## 4 21341
##
## $n_absent_cbgs
## [1] 46
##
## $absent_cbgs
## # A tibble: 46 × 1
## cbg
## <chr>
## 1 060133131031
## 2 060133551123
## 3 060170306022
## 4 060170306024
## 5 060170306031
## 6 060350403031
## 7 060350403051
## 8 060374303012
## 9 060450101002
## 10 060450101003
## # ℹ 36 more rows
##
## $n_pairs_missing
## [1] 193
##
## $pairs_missing
## # A tibble: 193 × 2
## Fire_ID cbg
## <fct> <chr>
## 1 14 060570003002
## 2 14 060570004021
## 3 14 060570004024
## 4 14 060570007022
## 5 14 060570009002
## 6 40 060590219151
## 7 40 060590219152
## 8 40 060590219211
## 9 40 060590219241
## 10 40 060590320111
## # ℹ 183 more rows
##
## $n_neither_fire_ids
## [1] 2
##
## $neither_fire_ids
## # A tibble: 2 × 1
## Fire_ID
## <fct>
## 1 183
## 2 21017
##
## $n_neither_cbgs
## [1] 26
##
## $neither_cbgs
## # A tibble: 26 × 1
## cbg
## <chr>
## 1 060450108011
## 2 060650406072
## 3 060650406151
## 4 060650406152
## 5 060650406162
## 6 060650407022
## 7 060650407031
## 8 060650407032
## 9 060650408131
## 10 060650408141
## # ℹ 16 more rows
# --- distinct key sets ------------------------------------------------------
pairs_cbg_fire <- cbg_fire %>% distinct(Fire_ID, cbg)
pairs_estab <- panel_regression_alarm %>% distinct(Fire_ID, cbg)
estab_fire_set <- unique(pairs_estab$Fire_ID)
estab_cbg_set <- unique(pairs_estab$cbg)
# Pairs that are in cbg_fire but NOT in estab_yearmonth_fire
unmatched_from_cbgfire <- anti_join(pairs_cbg_fire, pairs_estab, by = c("Fire_ID","cbg"))
# --- categories & outputs ---------------------------------------------------
# 1) Fire_IDs in cbg_fire absent globally from estab
absent_fire_ids_in_estab <- unmatched_from_cbgfire %>%
filter(!(Fire_ID %in% estab_fire_set)) %>%
distinct(Fire_ID) %>% arrange(Fire_ID)
# 2) CBGs in cbg_fire absent globally from estab
absent_cbgs_in_estab <- unmatched_from_cbgfire %>%
filter(!(cbg %in% estab_cbg_set)) %>%
distinct(cbg) %>% arrange(cbg)
# 3) Pairs missing where both keys exist separately in estab
pairs_missing_in_estab <- unmatched_from_cbgfire %>%
filter(Fire_ID %in% estab_fire_set, cbg %in% estab_cbg_set) %>%
distinct(Fire_ID, cbg) %>% arrange(Fire_ID, cbg)
# 4) “Neither key exists” in estab (for these cbg_fire pairs)
neither_exist_in_estab <- unmatched_from_cbgfire %>%
filter(!(Fire_ID %in% estab_fire_set) & !(cbg %in% estab_cbg_set))
neither_fire_ids_in_estab <- neither_exist_in_estab %>%
distinct(Fire_ID) %>% arrange(Fire_ID)
neither_cbgs_in_estab <- neither_exist_in_estab %>%
distinct(cbg) %>% arrange(cbg)
# --- compact summary you can print/view -------------------------------------
summary_checks <- list(
n_absent_fire_ids_in_estab = nrow(absent_fire_ids_in_estab),
absent_fire_ids_in_estab = absent_fire_ids_in_estab,
n_absent_cbgs_in_estab = nrow(absent_cbgs_in_estab),
absent_cbgs_in_estab = absent_cbgs_in_estab,
n_pairs_missing_in_estab = nrow(pairs_missing_in_estab),
pairs_missing_in_estab = pairs_missing_in_estab,
n_neither_fire_ids_in_estab= nrow(neither_fire_ids_in_estab),
neither_fire_ids_in_estab = neither_fire_ids_in_estab,
n_neither_cbgs_in_estab = nrow(neither_cbgs_in_estab),
neither_cbgs_in_estab = neither_cbgs_in_estab
)
summary_checks
## $n_absent_fire_ids_in_estab
## [1] 382
##
## $absent_fire_ids_in_estab
## # A tibble: 382 × 1
## Fire_ID
## <fct>
## 1 10
## 2 17
## 3 75
## 4 84
## 5 177
## 6 189
## 7 211
## 8 218
## 9 222
## 10 232
## # ℹ 372 more rows
##
## $n_absent_cbgs_in_estab
## [1] 8195
##
## $absent_cbgs_in_estab
## # A tibble: 8,195 × 1
## cbg
## <chr>
## 1 060014351031
## 2 060014351033
## 3 060014351041
## 4 060014380001
## 5 060014380002
## 6 060014381002
## 7 060014381003
## 8 060014381004
## 9 060014381005
## 10 060014381006
## # ℹ 8,185 more rows
##
## $n_pairs_missing_in_estab
## [1] 2
##
## $pairs_missing_in_estab
## # A tibble: 2 × 2
## Fire_ID cbg
## <fct> <chr>
## 1 41 060650481001
## 2 20850 061110076122
##
## $n_neither_fire_ids_in_estab
## [1] 344
##
## $neither_fire_ids_in_estab
## # A tibble: 344 × 1
## Fire_ID
## <fct>
## 1 10
## 2 17
## 3 75
## 4 84
## 5 189
## 6 211
## 7 218
## 8 222
## 9 232
## 10 281
## # ℹ 334 more rows
##
## $n_neither_cbgs_in_estab
## [1] 7694
##
## $neither_cbgs_in_estab
## # A tibble: 7,694 × 1
## cbg
## <chr>
## 1 060014511011
## 2 060014511012
## 3 060014511013
## 4 060014511014
## 5 060014512011
## 6 060014512022
## 7 060014512023
## 8 060014515041
## 9 060014515051
## 10 060014515052
## # ℹ 7,684 more rows
n_pairs <- panel_regression_alarm %>%
summarise(
n_fire = n_distinct(Fire_ID, na.rm = TRUE),
n_cbg = n_distinct(cbg, na.rm = TRUE)
)
n_pairs
## # A tibble: 1 × 2
## n_fire n_cbg
## <int> <int>
## 1 38 2343