library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(sf)

## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE

library(tigris)

## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.

library(tidycensus)
library(censusxy)
library(progressr)
library(fixest)

## 
## Attaching package: 'fixest'
## 
## The following object is masked from 'package:scales':
## 
##     pvalue

library(lubridate)

options(tigris_use_cache = TRUE)

he <- read_csv("data/hcd_housing/housing_element/housing_element_table.csv")

## Rows: 4069 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): county, jurisdiction, type, link_text, view_link
## dbl  (1): planning_period
## date (1): received_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

permits_juris <- read_csv("data/hcd_housing/raw/tablea2.csv") %>% 
  group_by(
    juris = JURIS_NAME,
    county = CNTY_NAME,
    year = YEAR,
    cat = UNIT_CAT, 
    tenure = TENURE
  ) %>%
  summarise(
    permits = sum(NO_BUILDING_PERMITS, na.rm = TRUE),
    permits_vli = sum(BP_VLOW_INCOME_DR + BP_VLOW_INCOME_NDR, na.rm = TRUE),
    permits_li = sum(BP_LOW_INCOME_DR + BP_LOW_INCOME_NDR, na.rm = TRUE),
    permits_mod = sum(BP_MOD_INCOME_DR + BP_MOD_INCOME_NDR, na.rm = TRUE),
    permits_above_mod = sum(BP_ABOVE_MOD_INCOME, na.rm = TRUE)
  ) %>% 
  filter(!is.na(year))

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 786023 Columns: 52
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (17): JURIS_NAME, CNTY_NAME, PRIOR_APN, APN, STREET_ADDRESS, PROJECT_NA...
## dbl  (32): YEAR, VLOW_INCOME_DR, VLOW_INCOME_NDR, LOW_INCOME_DR, LOW_INCOME_...
## date  (3): ENT_APPROVE_DT1, BP_ISSUE_DT1, CO_ISSUE_DT1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'juris', 'county', 'year', 'cat'. You can override using the `.groups` argument.

apps_juris <- read_csv("data/hcd_housing/raw/tablea.csv") %>% 
  filter(as.Date(APP_SUBMIT_DT, format="%m/%d/%Y") > as.Date("2016-01-01")) %>%
  mutate(month_year = format(as.Date(APP_SUBMIT_DT, format="%m/%d/%Y"), "%Y-%m")) %>%
  filter(UNIT_CAT == "5+", !(ABOVE_MOD_INCOME == TOT_PROPOSED_UNITS)) %>%
  group_by(
    juris = JURIS_NAME,
    county = CNTY_NAME,
    month_year
  ) %>%
  summarise(
    projects = n(),
    units = sum(TOT_PROPOSED_UNITS, na.rm = TRUE)
  ) %>% 
  filter(!is.na(month_year))

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 302094 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): JURIS_NAME, CNTY_NAME, PRIOR_APN, APN, STREET_ADDRESS, PROJECT_NA...
## dbl  (11): YEAR, VLOW_INCOME_DR, VLOW_INCOME_NDR, LOW_INCOME_DR, LOW_INCOME_...
## date  (1): APP_SUBMIT_DT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'juris', 'county'. You can override using the `.groups` argument.

apps_juris %>% 
  mutate(month_year = as.Date(paste0(month_year, "-01"))) %>%
  arrange(month_year) %>%
  filter(month_year > as.Date("2017-12-01"), month_year < as.Date("2025-01-01")) %>%
  group_by(month_year) %>%
  summarise(units = sum(units, na.rm = TRUE)) %>%
  ggplot(aes(x = month_year, y = units)) +
  geom_line(size = 2) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  theme_minimal() +
  labs(
    x = "Month-Year",
    y = "Number of Proposed Units",
    title = "Proposed Units by Month"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# ----------------------------
# 1) helpers: cleaning + region mapping + deadlines
# ----------------------------
clean_name <- function(x) {
  x %>%
    str_squish() %>%
    str_to_upper() %>%
    str_replace_all("[^A-Z0-9 ]", "") %>%
    str_squish()
}

county_to_region <- function(county_clean) {
  case_when(
    county_clean %in% c("ALAMEDA","CONTRA COSTA","MARIN","NAPA","SAN FRANCISCO","SAN MATEO","SANTA CLARA","SOLANO","SONOMA") ~ "ABAG",
    county_clean %in% c("MONTEREY","SANTA CRUZ") ~ "AMBAG",
    county_clean %in% c("EL DORADO","PLACER","SACRAMENTO","SUTTER","YOLO","YUBA") ~ "SACOG",
    county_clean %in% c("SAN DIEGO") ~ "SANDAG",
    county_clean %in% c("IMPERIAL","LOS ANGELES","ORANGE","RIVERSIDE","SAN BERNARDINO","VENTURA") ~ "SCAG",
    county_clean %in% c("SAN LUIS OBISPO") ~ "SLOCOG",
    TRUE ~ county_clean
  )
}

region_deadlines <- tribble(
  ~region,  ~due_date,
  "ABAG",   as.Date("2023-01-31"),
  "AMBAG",  as.Date("2023-12-15"),
  "SACOG",  as.Date("2021-05-15"),
  "SANDAG", as.Date("2021-04-30"),
  "SCAG",   as.Date("2021-10-15")
)

# month helpers
month_start_from_ym <- function(ym_str) {
  ymd(paste0(ym_str, "-01"))
}
month_end_from_start <- function(d) {
  ceiling_date(d, "month") - days(1)
}
month_id <- function(d) year(d) * 12L + month(d)

# ----------------------------
# 2) build BR windows (start = region due date; end = latest Adopted in planning_period 6)
# ----------------------------
he_clean <- he %>%
  mutate(
    juris_clean  = clean_name(jurisdiction),
    county_clean = clean_name(county),
    region       = county_to_region(county_clean),
    type_clean   = str_to_upper(str_squish(type)),
    planning_period = as.integer(planning_period),
    received_date = as.Date(received_date)
  ) %>%
  filter(region %in% region_deadlines$region)

adoption_6 <- he_clean %>%
  filter(planning_period == 6, str_detect(type_clean, "ADOPT")) %>%
  group_by(juris_clean, county_clean, region) %>%
  summarise(adopted_6 = max(received_date, na.rm = TRUE), .groups = "drop")

br_windows <- he_clean %>%
  distinct(juris_clean, county_clean, region) %>%
  left_join(region_deadlines, by = "region") %>%
  left_join(adoption_6, by = c("juris_clean", "county_clean", "region")) %>%
  mutate(
    # if adopted_6 exists but is before due_date, BR never turns on
    br_end_eff = case_when(
      is.na(adopted_6)         ~ as.Date(NA),
      adopted_6 < due_date     ~ due_date - days(1),
      TRUE                     ~ adopted_6
    ),
    br_start = due_date
  )

# ----------------------------
# 3) attach BR on/off to monthly applications
# ----------------------------
apps_clean <- apps_juris %>%
  mutate(
    juris_clean  = clean_name(juris),
    county_clean = clean_name(county),
    region       = county_to_region(county_clean),
    month_start  = month_start_from_ym(month_year),
    month_end    = month_end_from_start(month_start),
    month_id     = month_id(month_start)
  ) %>%
  filter(region %in% region_deadlines$region) %>%
  left_join(br_windows %>% select(juris_clean, county_clean, region, br_start, br_end_eff),
            by = c("juris_clean", "county_clean", "region"))

# If br_end_eff is NA (no adopted_6 yet), treat BR as on through the end of your apps panel
panel_max_end <- max(apps_clean$month_end, na.rm = TRUE)

apps_br <- apps_clean %>%
  mutate(
    br_end_open = coalesce(br_end_eff, panel_max_end),
    br_on = (month_end >= br_start) & (month_start <= br_end_open) & (br_end_open >= br_start),
    br_pre = month_end < br_start,
    br_post = month_start > br_end_open
  ) %>% 
  filter(month_start < as.Date("2025-01-01"))

apps_br_summ <- apps_br %>%
  filter(!is.na(br_end_eff)) %>% 
  mutate(
    br_status = case_when(
      br_pre  ~ "br_pre",
      br_on   ~ "br_on",
      br_post ~ "br_post",
      TRUE    ~ NA_character_
    ),
    br_status = factor(br_status, levels = c("br_pre", "br_on", "br_post"))
  ) %>%
  filter(!is.na(br_status)) %>%
  group_by(br_status) %>%
  summarise(
    n_juris_month = n(),
    n_juris       = n_distinct(juris_clean),
    avg_units     = mean(units, na.rm = TRUE),
    med_units     = median(units, na.rm = TRUE),
    avg_projects  = mean(projects, na.rm = TRUE),
    total_units   = sum(units, na.rm = TRUE),
    .groups = "drop"
  )

apps_br_summ

## # A tibble: 3 × 7
##   br_status n_juris_month n_juris avg_units med_units avg_projects total_units
##   <fct>             <int>   <int>     <dbl>     <dbl>        <dbl>       <dbl>
## 1 br_pre             1215     226      216.        84         2.50      262989
## 2 br_on               368     143      230.        80         2.07       84603
## 3 br_post             667     165      262.       106         2.35      175021

apps_br %>% 
  count(is.na(br_end_eff))

## # A tibble: 290 × 4
## # Groups:   juris, county [290]
##    juris           county         `is.na(br_end_eff)`     n
##    <chr>           <chr>          <lgl>               <int>
##  1 AGOURA HILLS    Los Angeles    FALSE                   4
##  2 ALAMEDA         Alameda        FALSE                   7
##  3 ALAMEDA COUNTY  Alameda        FALSE                   7
##  4 ALBANY          Alameda        FALSE                   5
##  5 ALHAMBRA        Los Angeles    FALSE                   7
##  6 ALISO VIEJO     Orange         FALSE                   4
##  7 AMERICAN CANYON Napa           TRUE                    5
##  8 ANAHEIM         Orange         FALSE                  10
##  9 ANTIOCH         Contra Costa   FALSE                   2
## 10 APPLE VALLEY    San Bernardino FALSE                  11
## # ℹ 280 more rows

# avg duration of BR window among treated juris
br_durations <- br_windows %>%
  filter(!is.na(br_end_eff)) %>%
  mutate(
    duration_days = as.integer(br_end_eff - br_start) + 1,
    duration_months = interval(br_start, br_end_eff) %/% months(1) + 1
  ) 

summary(br_durations$duration_months)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   13.00   16.87   24.00   51.00

br_durations %>% arrange(desc(duration_months))

## # A tibble: 361 × 9
##    juris_clean   county_clean region due_date   adopted_6  br_end_eff br_start  
##    <chr>         <chr>        <chr>  <date>     <date>     <date>     <date>    
##  1 CANYON LAKE   RIVERSIDE    SCAG   2021-10-15 2025-12-15 2025-12-15 2021-10-15
##  2 HIDDEN HILLS  LOS ANGELES  SCAG   2021-10-15 2025-12-15 2025-12-15 2021-10-15
##  3 ROCKLIN       PLACER       SACOG  2021-05-15 2025-08-05 2025-08-05 2021-05-15
##  4 SAN DIMAS     LOS ANGELES  SCAG   2021-10-15 2025-11-20 2025-11-20 2021-10-15
##  5 COMMERCE      LOS ANGELES  SCAG   2021-10-15 2025-10-30 2025-10-30 2021-10-15
##  6 DESERT HOT S… RIVERSIDE    SCAG   2021-10-15 2025-11-14 2025-11-14 2021-10-15
##  7 HESPERIA      SAN BERNARD… SCAG   2021-10-15 2025-11-06 2025-11-06 2021-10-15
##  8 PLACER COUNTY PLACER       SACOG  2021-05-15 2025-06-12 2025-06-12 2021-05-15
##  9 SEAL BEACH    ORANGE       SCAG   2021-10-15 2025-10-29 2025-10-29 2021-10-15
## 10 VILLA PARK    ORANGE       SCAG   2021-10-15 2025-10-30 2025-10-30 2021-10-15
## # ℹ 351 more rows
## # ℹ 2 more variables: duration_days <dbl>, duration_months <dbl>

OFF: Quarterly Balanced Panel

quarter_id <- function(d) year(d) * 4L + quarter(d)

apps_q <- apps_br %>%
  mutate(
    q_start = floor_date(month_start, "quarter"),
    units   = replace_na(units, 0),
    projects = replace_na(projects, 0),
    br_start_q = floor_date(br_start, "quarter")
  ) %>%
  group_by(juris_clean, region, q_start, br_start_q, br_end_eff) %>%
  summarise(
    units_q    = sum(units, na.rm = TRUE),
    projects_q = sum(projects, na.rm = TRUE),
    .groups = "drop"
  )

# balanced quarter panel (fill missing quarters with 0s)
apps_q_panel <- apps_q %>%
  group_by(juris_clean, region) %>%
  complete(
    q_start = seq(min(q_start, na.rm = TRUE),
                  max(q_start, na.rm = TRUE),
                  by = "quarter"),
    fill = list(units_q = 0, projects_q = 0)
  ) %>%
  ungroup() %>%
  left_join(
    apps_q %>% distinct(juris_clean, region, br_start_q, br_end_eff),
    by = c("juris_clean", "region", "br_start_q", "br_end_eff")
  ) %>%
  mutate(
    year = year(q_start),
    t = quarter_id(q_start),

    # cohort g_i = first quarter when BR is OFF
    # define "post" as the quarter AFTER the adoption quarter
    br_end_q = floor_date(br_end_eff, "quarter"),
    g = if_else(
      is.na(br_end_eff),
      NA_integer_,
      quarter_id(br_end_q %m+% months(3))
    ),

    in_br_era = q_start >= br_start_q,

    y_units    = log1p(units_q),
    y_projects = log1p(projects_q),

    region_t = interaction(region, t, drop = TRUE),
    region_y = interaction(region, year, drop = TRUE)
  )

apps_est_q <- apps_q_panel %>% filter(in_br_era)

Log Units + Log Projects

apps_est_q_win <- apps_est_q %>%
  mutate(rel_q = if_else(!is.na(g), t - g, NA_integer_)) %>%
  filter(is.na(g) | dplyr::between(rel_q, -8L, 8L))

# --- re-run SA event studies on the restricted sample ---
m_sa_units_q_win <- feols(
  y_units ~ sunab(g, t, ref.p = -1) | juris_clean + region_y,
  cluster = ~juris_clean,
  data = apps_est_q_win
)

## NOTE: 21 observations removed because of NA values (RHS: 21).

## The variables 't::5:cohort::8087', 't::8:cohort::8089' and 't::8:cohort::8092' have been removed because of collinearity (see $collin.var).

m_sa_projects_q_win <- feols(
  y_projects ~ sunab(g, t, ref.p = -1) | juris_clean  + region_y,
  cluster = ~juris_clean,
  data = apps_est_q_win
)

## NOTE: 21 observations removed because of NA values (RHS: 21).
## The variables 't::5:cohort::8087', 't::8:cohort::8089' and 't::8:cohort::8092' have been removed because of collinearity (see $collin.var).

# --- plots, zoomed to ±8 quarters ---
iplot(
  m_sa_units_q_win,
  ref.line = 0,
  xlim = c(-8, 8),
  main = "BR window ending → log(1+units), quarterly (±8Q)",
  xlab = "Quarters relative to BR end (0 = first BR-off quarter)",
  ylab = "ATT relative to quarter -1"
)

iplot(
  m_sa_projects_q_win,
  ref.line = 0,
  xlim = c(-8, 8),
  main = "BR window ending → log(1+projects), quarterly (±8Q)",
  xlab = "Quarters relative to BR end (0 = first BR-off quarter)",
  ylab = "ATT relative to quarter -1"
)

summary(apps_est_q_win$y_units)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6931  3.7553  4.7318  4.7663  5.8763  9.6172

# apps_est_q <- apps_est_q %>%
#   mutate(post_end = !is.na(g) & t >= g)
# 
# m_post_q <- feols(
#   y_units ~ post_end | juris_clean + region_y,
#   cluster = ~juris_clean,
#   data = apps_est_q
# )

#etable(m_post_q)

# do agg estimates from m_sa_units_q_win 
summary(m_sa_units_q_win, agg = "att")

## OLS estimation, Dep. Var.: y_units
## Observations: 643
## Fixed-effects: juris_clean: 212,  region_y: 16
## Standard-errors: Clustered (juris_clean) 
##      Estimate Std. Error  t value Pr(>|t|) 
## ATT -0.360989   0.244807 -1.47459  0.14181 
## ... 3 variables were removed because of collinearity (t::5:cohort::8087, t::8:cohort::8089 and t::8:cohort::8092)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.787401     Adj. R2: 0.335731
##                  Within R2: 0.313117

summary(m_sa_projects_q_win, agg = "att")

## OLS estimation, Dep. Var.: y_projects
## Observations: 643
## Fixed-effects: juris_clean: 212,  region_y: 16
## Standard-errors: Clustered (juris_clean) 
##      Estimate Std. Error  t value Pr(>|t|)    
## ATT -0.192409   0.072928 -2.63835 0.008952 ** 
## ... 3 variables were removed because of collinearity (t::5:cohort::8087, t::8:cohort::8089 and t::8:cohort::8092)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.248495     Adj. R2: 0.633486
##                  Within R2: 0.407031

Avg Units / Project

# --- average units per project (quarter) ---
apps_est_q_win_upp <- apps_est_q_win %>%
  mutate(
    avg_units_per_project = if_else(projects_q > 0, units_q / projects_q, NA_real_),
    y_upp = log(avg_units_per_project),          # intensive margin
    # optional: keep finite logs only
    y_upp = if_else(is.finite(y_upp), y_upp, NA_real_)
  ) %>%
  filter(!is.na(y_upp))   # drops quarters with 0 projects (and any weird cases)

m_sa_upp_q_win <- feols(
  y_upp ~ sunab(g, t, ref.p = -1) | juris_clean + region_y,
  cluster = ~juris_clean,
  data = apps_est_q_win_upp
)

## NOTE: 21 observations removed because of NA values (RHS: 21).

## The variables 't::5:cohort::8087', 't::8:cohort::8089' and 't::8:cohort::8092' have been removed because of collinearity (see $collin.var).

etable(m_sa_upp_q_win)

##                    m_sa_upp_q_win
## Dependent Var.:             y_upp
##                                  
## t = -8           -0.5702 (0.6767)
## t = -7           -0.8268 (0.7369)
## t = -6            0.0625 (0.4536)
## t = -5           0.8726* (0.4187)
## t = -4            0.5016 (0.3293)
## t = -3           -0.1323 (0.2209)
## t = -2            0.2213 (0.2328)
## t = 0            -0.1530 (0.2192)
## t = 1             0.0058 (0.2018)
## t = 2             0.3656 (0.2502)
## t = 3            -0.1047 (0.3041)
## t = 4            -0.0552 (0.2963)
## t = 5           -0.6100. (0.3376)
## t = 6            -0.5313 (0.5047)
## t = 7             0.5960 (0.5648)
## t = 8            -0.0147 (0.5528)
## Fixed-Effects:  -----------------
## juris_clean                   Yes
## region_y                      Yes
## _______________ _________________
## S.E.: Clustered   by: juris_clean
## Observations                  643
## R2                        0.68223
## Within R2                 0.32713
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

iplot(
  m_sa_upp_q_win,
  ref.line = 0,
  xlim = c(-8, 8),
  main = "BR window ending → log(avg units per project), quarterly (±8Q)",
  xlab = "Quarters relative to BR end (0 = first BR-off quarter)",
  ylab = "ATT relative to quarter -1"
)

% Projects bigger than pre-period 90th percentile

quarter_id <- function(d) year(d) * 4L + quarter(d)

# ----------------------------
# 1) project-level apps with robust date parsing
# ----------------------------
apps_raw <- read_csv("data/hcd_housing/raw/tablea.csv") %>% 
    filter(UNIT_CAT == "5+", !(ABOVE_MOD_INCOME == TOT_PROPOSED_UNITS)) %>%
clean_names()

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 302094 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): JURIS_NAME, CNTY_NAME, PRIOR_APN, APN, STREET_ADDRESS, PROJECT_NA...
## dbl  (11): YEAR, VLOW_INCOME_DR, VLOW_INCOME_NDR, LOW_INCOME_DR, LOW_INCOME_...
## date  (1): APP_SUBMIT_DT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

pick1 <- function(cands, nm) {
  out <- intersect(cands, nm)[1]
  if (is.na(out)) NA_character_ else out
}

date_col  <- pick1(c("app_submit_dt","app_submit_date","submitted_dt","submitted_date"), names(apps_raw))
units_col <- pick1(c("tot_proposed_units","total_proposed_units","proposed_units","tot_units","units"), names(apps_raw))
juris_col <- pick1(c("juris_name","jurisdiction","jurisdiction_name","juris"), names(apps_raw))
county_col<- pick1(c("cnty_name","county","county_name"), names(apps_raw))

if (any(is.na(c(date_col, units_col, juris_col, county_col)))) {
  stop(paste(
    "Missing needed column.",
    "\n date_col  =", date_col,
    "\n units_col =", units_col,
    "\n juris_col =", juris_col,
    "\n county_col=", county_col
  ))
}

id_col <- pick1(c("app_id","application_id","project_id","appl_id","project_number","application_number"), names(apps_raw))

date_orders <- c(
  "mdy", "mdy HMS", "mdy HM",
  "ymd", "ymd HMS", "ymd HM",
  "m/d/Y", "m/d/y",
  "Y-m-d", "Y-m-d H:M:S",
  "Y/m/d", "Y/m/d H:M:S"
)

apps_proj <- apps_raw %>%
  mutate(
    project_id = if (is.na(id_col)) row_number() else as.character(.data[[id_col]]),
    app_date   = suppressWarnings(parse_date_time(.data[[date_col]], orders = date_orders, tz = "UTC")) %>% as.Date(),
    proj_units = readr::parse_number(as.character(.data[[units_col]])),
    juris_clean  = clean_name(.data[[juris_col]]),
    county_clean = clean_name(.data[[county_col]]),
    region       = county_to_region(county_clean),
    q_start      = floor_date(app_date, "quarter"),
    year = year(q_start),
    region_y = interaction(region, year, drop = TRUE)
  ) %>%
  filter(!is.na(q_start), app_date > as.Date("2016-01-01")) %>%
  group_by(juris_clean, county_clean, region,region_y, q_start, project_id) %>%
  summarise(
    proj_units = if (all(is.na(proj_units))) NA_real_ else max(proj_units, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(proj_units))

# ----------------------------
# 2) attach BR timing; define event time (g = first BR-off quarter)
# ----------------------------
apps_proj <- apps_proj %>%
  left_join(
    br_windows %>% select(juris_clean, county_clean, region, br_start, br_end_eff),
    by = c("juris_clean","county_clean","region")
  ) %>%
  mutate(
    br_start_q = floor_date(br_start, "quarter"),
    t          = quarter_id(q_start),
    br_end_q   = floor_date(br_end_eff, "quarter"),
    g          = if_else(is.na(br_end_eff), NA_integer_, quarter_id(br_end_q %m+% months(3))),
    rel_q      = if_else(!is.na(g), t - g, NA_integer_),
    in_br_era  = q_start >= br_start_q,

    # keeps cities that actually have (or could have) a BR window
    ever_br = is.na(br_end_eff) | (br_end_eff >= br_start)
  ) %>%
  filter(ever_br)

# ----------------------------
# 3) define juris-specific pre-period p90 size, using ALL BR-ERA pre quarters
# ----------------------------
pre_cut <- apps_proj %>%
  filter(in_br_era) %>%
  mutate(pre_period = if_else(is.na(g), TRUE, t < g))

pre_p90_by_juris <- pre_cut %>%
  group_by(juris_clean) %>%
  summarise(
    n_pre_projects = sum(pre_period),
    p90_pre_size = if (sum(pre_period) == 0) NA_real_
      else as.numeric(quantile(proj_units[pre_period], 0.75, na.rm = TRUE, type = 7)),
    .groups = "drop"
  ) %>%
  # optional: require some minimum pre support for a stable p90
  filter(n_pre_projects >= 10, !is.na(p90_pre_size))

apps_proj2 <- apps_proj %>%
  left_join(pre_p90_by_juris, by = "juris_clean") %>%
  filter(!is.na(p90_pre_size)) %>%
  mutate(
    large = proj_units > p90_pre_size
  )

# ----------------------------
# 4) collapse to juris × quarter: share large
# ----------------------------
apps_share_q <- apps_proj2 %>%
  group_by(juris_clean, region, q_start) %>%
  summarise(
    projects_q    = n(),
    share_large_q = mean(large, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  left_join(
    apps_proj2 %>% distinct(juris_clean, region, region_y, br_start_q, g),
    by = c("juris_clean","region")
  ) %>%
  mutate(
    t = quarter_id(q_start),
    rel_q = if_else(!is.na(g), t - g, NA_integer_),
    region_t = interaction(region, t, drop = TRUE),
    in_br_era = q_start >= br_start_q
  ) %>%
  filter(in_br_era) %>%
  filter(projects_q > 0)

## Warning in left_join(., apps_proj2 %>% distinct(juris_clean, region, region_y, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# ----------------------------
# 5) staggered DiD on share_large (±8 quarters)
# ----------------------------
apps_share_q_win <- apps_share_q %>%
  filter(is.na(g) | dplyr::between(rel_q, -8L, 8L))

m_share_large <- feols(
  share_large_q ~ sunab(g, t, ref.p = -1) | juris_clean + region_y,
  cluster = ~juris_clean,
  data = apps_share_q_win
)

## NOTE: 32 observations removed because of NA values (RHS: 32).

# average ATT (single number)
summary(m_share_large, agg = "att")

## OLS estimation, Dep. Var.: share_large_q
## Observations: 761
## Fixed-effects: juris_clean: 16,  region_y: 32
## Standard-errors: Clustered (juris_clean) 
##     Estimate Std. Error t value Pr(>|t|)    
## ATT 0.078278   0.041602 1.88162 0.079439 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.134423     Adj. R2: 0.776109
##                  Within R2: 0.758532

# event study plot
iplot(
  m_share_large,
  ref.line = 0,
  xlim = c(-8, 8),
  main = "% projects > pre-period 90th percentile size, quarterly (±8Q)",
  xlab = "Quarters relative to BR end (0 = first BR-off quarter)",
  ylab = "ATT relative to quarter -1"
)

summary(apps_share_q$share_large_q)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.2500  0.3074  0.5000  1.0000

ON did

quarter_id <- function(d) year(d) * 4L + quarter(d)

# quarter collapse
apps_q_on <- apps_br %>%
  mutate(
    q_start = floor_date(month_start, "quarter"),
    units   = replace_na(units, 0),
    projects = replace_na(projects, 0)
  ) %>%
  group_by(juris_clean, region, q_start) %>%
  summarise(
    units_q    = sum(units, na.rm = TRUE),
    projects_q = sum(projects, na.rm = TRUE),
    .groups = "drop"
  )

# balanced quarter panel over the observed quarter range
q_min <- min(apps_q_on$q_start, na.rm = TRUE)
q_max <- max(apps_q_on$q_start, na.rm = TRUE)

apps_q_panel_on <- apps_q_on %>%
  group_by(juris_clean, region) %>%
  complete(
    q_start = seq(q_min, q_max, by = "quarter"),
    fill = list(units_q = 0, projects_q = 0)
  ) %>%
  ungroup() %>%
  left_join(
    br_windows %>% 
      select(juris_clean, region, br_start, br_end_eff),
    by = c("juris_clean", "region")
  ) %>%
  mutate(
    t = quarter_id(q_start),

    br_start_q = floor_date(br_start, "quarter"),

    # did BR ever actually turn on?
    # (your construction makes br_end_eff < br_start when adopted before due_date)
    br_ever_on = is.na(br_end_eff) | (br_end_eff >= br_start),

    # cohort g_on = first FULL quarter when BR is on (quarter after due-date quarter)
    g_on = if_else(br_ever_on, quarter_id(br_start_q %m+% months(3)), NA_integer_),

    rel_q_on = if_else(!is.na(g_on), t - g_on, NA_integer_),

    y_units    = log1p(units_q),
    y_projects = log1p(projects_q),

    avg_units_per_project = if_else(projects_q > 0, units_q / projects_q, NA_real_),
    y_upp = if_else(is.finite(log(avg_units_per_project)), log(avg_units_per_project), NA_real_),
    year = year(q_start),

    region_t = interaction(region, t, drop = TRUE),
    region_y = interaction(region, year, drop = TRUE)
  )

apps_on_treated <- apps_q_panel_on %>%
  filter(!is.na(g_on), dplyr::between(rel_q_on, -8L, 8L))

t_keep <- apps_on_treated %>% distinct(t) %>% pull(t)

apps_est_q_on_win <- apps_q_panel_on %>%
  filter(t %in% t_keep) %>% 
  filter(is.na(g_on) | dplyr::between(rel_q_on, -8L, 8L))

m_sa_units_q_on <- feols(
  y_units ~ sunab(g_on, t, ref.p = -1) | juris_clean + region_y,
  cluster = ~juris_clean,
  data = apps_est_q_on_win
)

## NOTE: 220 observations removed because of NA values (RHS: 220).

## The variables 't::-7:cohort::8087', 't::-6:cohort::8094', 't::-5:cohort::8089', 't::-5:cohort::8097', 't::-3:cohort::8087', 't::-2:cohort::8094' and 7 others have been removed because of collinearity (see $collin.var).

m_sa_projects_q_on <- feols(
  y_projects ~ sunab(g_on, t, ref.p = -1) | juris_clean + region_y ,
  cluster = ~juris_clean,
  data = apps_est_q_on_win
)

## NOTE: 220 observations removed because of NA values (RHS: 220).
## The variables 't::-7:cohort::8087', 't::-6:cohort::8094', 't::-5:cohort::8089', 't::-5:cohort::8097', 't::-3:cohort::8087', 't::-2:cohort::8094' and 7 others have been removed because of collinearity (see $collin.var).

# average ATT (single number)
summary(m_sa_units_q_on,    agg = "att")

## OLS estimation, Dep. Var.: y_units
## Observations: 4,521
## Fixed-effects: juris_clean: 280,  region_y: 22
## Standard-errors: Clustered (juris_clean) 
##      Estimate Std. Error  t value Pr(>|t|)    
## ATT -0.147911   0.086855 -1.70298 0.089686 .  
## ... 13 variables were removed because of collinearity (t::-7:cohort::8087, t::-6:cohort::8094 and 11 others [full set in $collin.var])
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.62962     Adj. R2: 0.315511
##                 Within R2: 0.008916

summary(m_sa_projects_q_on, agg = "att")

## OLS estimation, Dep. Var.: y_projects
## Observations: 4,521
## Fixed-effects: juris_clean: 280,  region_y: 22
## Standard-errors: Clustered (juris_clean) 
##      Estimate Std. Error  t value Pr(>|t|)    
## ATT -0.037627   0.018677 -2.01466 0.044899 *  
## ... 13 variables were removed because of collinearity (t::-7:cohort::8087, t::-6:cohort::8094 and 11 others [full set in $collin.var])
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.353042     Adj. R2: 0.516858
##                  Within R2: 0.009583

apps_upp_on <- apps_est_q_on_win %>% filter(!is.na(y_upp))

m_sa_upp_q_on <- feols(
  y_upp ~ sunab(g_on, t, ref.p = -1) | juris_clean ,
  cluster = ~juris_clean,
  data = apps_upp_on
)

## NOTE: 76 observations removed because of NA values (RHS: 76).

summary(m_sa_upp_q_on, agg = "att")

## OLS estimation, Dep. Var.: y_upp
## Observations: 961
## Fixed-effects: juris_clean: 251
## Standard-errors: Clustered (juris_clean) 
##     Estimate Std. Error t value Pr(>|t|) 
## ATT 0.216092   0.157609 1.37106  0.17159 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.852797     Adj. R2: 0.234749
##                  Within R2: 0.062168

quarter_id <- function(d) year(d) * 4L + quarter(d)

# project-level tablea parsing (use your existing robust block)
# assumes you already created: apps_proj (juris_clean, county_clean, region, q_start, proj_units)

apps_proj_on <- apps_proj %>%
  left_join(
    br_windows %>% select(juris_clean, county_clean, region, br_start, br_end_eff),
    by = c("juris_clean","county_clean","region", "br_start", "br_end_eff")
  ) %>%
  mutate(
    br_start_q = floor_date(br_start, "quarter"),
    t = quarter_id(q_start),

    br_ever_on = is.na(br_end_eff) | (br_end_eff >= br_start),
    g_on = if_else(br_ever_on, quarter_id(br_start_q %m+% months(3)), NA_integer_),

    pre_period = !is.na(g_on) & (t < g_on)
  )

# juris-specific pre p90 (require some support)
pre_p90_by_juris_on <- apps_proj_on %>%
  group_by(juris_clean) %>%
  summarise(
    n_pre_projects = sum(pre_period, na.rm = TRUE),
    p90_pre_size = if (sum(pre_period, na.rm = TRUE) == 0) NA_real_
      else as.numeric(quantile(proj_units[pre_period], 0.90, na.rm = TRUE, type = 7)),
    .groups = "drop"
  ) %>%
  filter(n_pre_projects >= 5, !is.na(p90_pre_size))

apps_proj_on2 <- apps_proj_on %>%
  left_join(pre_p90_by_juris_on, by = "juris_clean") %>%
  filter(!is.na(p90_pre_size)) %>%
  mutate(large = proj_units > p90_pre_size)

# collapse to share large per quarter
apps_share_q_on <- apps_proj_on2 %>%
  group_by(juris_clean, region, q_start) %>%
  summarise(
    projects_q = n(),
    share_large_q = mean(large, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  left_join(
    apps_proj_on2 %>% distinct(juris_clean, region, g_on, br_start_q),
    by = c("juris_clean","region")
  ) %>%
  mutate(
    t = quarter_id(q_start),
    rel_q_on = if_else(!is.na(g_on), t - g_on, NA_integer_),
    region_t = interaction(region, t, drop = TRUE), 
    year = year(q_start),
    region_y = interaction(region, year, drop = TRUE)
  ) %>%
  filter(projects_q > 0)

# ±8 window and align controls to treated calendar quarters
treated_share <- apps_share_q_on %>%
  filter(!is.na(g_on), dplyr::between(rel_q_on, -8L, 8L))

t_keep_share <- treated_share %>% distinct(t) %>% pull(t)

apps_share_q_on_win <- apps_share_q_on %>%
  filter(t %in% t_keep_share) %>%
  filter(is.na(g_on) | dplyr::between(rel_q_on, -8L, 8L))

m_share_large_on <- feols(
  share_large_q ~ sunab(g_on, t, ref.p = -1) | juris_clean + region_y,
  cluster = ~juris_clean,
  data = apps_share_q_on_win
)

## The variables 't::-7:cohort::8087', 't::-6:cohort::8094', 't::-5:cohort::8089', 't::-5:cohort::8097', 't::-3:cohort::8087', 't::-2:cohort::8094' and 7 others have been removed because of collinearity (see $collin.var).

summary(m_share_large_on, agg = "att")

## OLS estimation, Dep. Var.: share_large_q
## Observations: 547
## Fixed-effects: juris_clean: 85,  region_y: 22
## Standard-errors: Clustered (juris_clean) 
##      Estimate Std. Error   t value Pr(>|t|) 
## ATT -0.021215   0.043258 -0.490444   0.6251 
## ... 13 variables were removed because of collinearity (t::-7:cohort::8087, t::-6:cohort::8094 and 11 others [full set in $collin.var])
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.269199     Adj. R2: 0.159011
##                  Within R2: 0.10186

OFF but with lagged building permits

# ============================================================
# BR OFF (lag +4 quarters) → QUARTERLY PERMITS
# Outcome: permits issued (Table A2) in 5+ projects where
#          the project has ≥1 unit that is ≤ moderate income.
# Filter: BP_ISSUE_DT1 is not NA (so we can do quarters)
# ============================================================

a2_q <- read_csv("data/hcd_housing/raw/tablea2.csv", show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(
    juris_clean  = clean_name(juris_name),
    county_clean = clean_name(cnty_name),
    region       = county_to_region(county_clean)
  ) %>%
  filter(region %in% region_deadlines$region) %>%
  filter(unit_cat == "5+") %>%
  filter(!is.na(bp_issue_dt1)) %>%
  mutate(
    q_start = floor_date(bp_issue_dt1, "quarter"),
    t       = quarter_id(q_start),
    year    = year(q_start),

    # project has at least one ≤moderate unit (using BP income counts)
    bp_le_mod_units =
      coalesce(bp_vlow_income_dr, 0) + coalesce(bp_vlow_income_ndr, 0) +
      coalesce(bp_low_income_dr, 0)  + coalesce(bp_low_income_ndr, 0) +
      coalesce(bp_mod_income_dr, 0)  + coalesce(bp_mod_income_ndr, 0),

    has_le_mod = bp_le_mod_units > 0
  ) %>%
  filter(has_le_mod)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# collapse to juris × quarter (units issued in qualifying projects)
permits_lemod_5p_q <- a2_q %>%
  group_by(juris_clean, county_clean, region, q_start) %>%
  summarise(
    permits_units_in_qualproj = sum(no_building_permits, na.rm = TRUE),
    le_mod_units_in_qualproj  = sum(bp_le_mod_units,     na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # balanced quarterly panel (fill missing quarters with 0)
  group_by(juris_clean, county_clean, region) %>%
  complete(
    q_start = seq(min(q_start, na.rm = TRUE), max(q_start, na.rm = TRUE), by = "quarter"),
    fill = list(permits_units_in_qualproj = 0, le_mod_units_in_qualproj = 0)
  ) %>%
  ungroup() %>%
  mutate(
    t    = quarter_id(q_start),
    year = year(q_start),
    region_y = interaction(region, year, drop = TRUE)
  ) %>%
  left_join(
    br_windows %>% select(juris_clean, county_clean, region, br_start, br_end_eff),
    by = c("juris_clean", "county_clean", "region")
  ) %>%
  mutate(
    br_start_q = floor_date(br_start, "quarter"),
    br_end_q   = floor_date(br_end_eff, "quarter"),

    # g_base = first BR-off quarter (quarter after adoption quarter)
    g_base = if_else(is.na(br_end_eff), NA_integer_, quarter_id(br_end_q %m+% months(3))),

    # lag the “event” by 4 quarters (≈ 1 year)
    g_perm = if_else(is.na(g_base), NA_integer_, g_base + 4L),

    in_br_era = q_start >= br_start_q,
    rel_q     = if_else(!is.na(g_perm), t - g_perm, NA_integer_),

    y_perm_units = log1p(permits_units_in_qualproj),
    y_lemod_units = log1p(le_mod_units_in_qualproj)
  ) %>%
  filter(in_br_era) %>%
  filter(is.na(g_perm) | dplyr::between(rel_q, -8L, 8L))

# --- event study: total permitted units in qualifying projects ---
m_br_off_perm_units_lag4 <- feols(
  y_perm_units ~ sunab(g_perm, t, ref.p = -1) | juris_clean + region_y,
  cluster = ~juris_clean,
  data = permits_lemod_5p_q
)

## NOTE: 23 observations removed because of NA values (RHS: 23).

## The variables 't::-3:cohort::8091', 't::2:cohort::8097' and 't::8:cohort::8092' have been removed because of collinearity (see $collin.var).

# --- event study: ≤moderate units (within qualifying projects) ---
m_br_off_lemod_units_lag4 <- feols(
  y_lemod_units ~ sunab(g_perm, t, ref.p = -1) | juris_clean + region_y,
  cluster = ~juris_clean,
  data = permits_lemod_5p_q
)

## NOTE: 23 observations removed because of NA values (RHS: 23).
## The variables 't::-3:cohort::8091', 't::2:cohort::8097' and 't::8:cohort::8092' have been removed because of collinearity (see $collin.var).

summary(m_br_off_lemod_units_lag4, agg = "att")

## OLS estimation, Dep. Var.: y_lemod_units
## Observations: 874
## Fixed-effects: juris_clean: 139,  region_y: 16
## Standard-errors: Clustered (juris_clean) 
##     Estimate Std. Error t value Pr(>|t|)    
## ATT -0.64341   0.375362 -1.7141 0.088755 .  
## ... 3 variables were removed because of collinearity (t::-3:cohort::8091, t::2:cohort::8097 and t::8:cohort::8092)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.48706     Adj. R2: 0.33773 
##                 Within R2: 0.200113

iplot(
  m_br_off_lemod_units_lag4,
  ref.line = 0,
  xlim = c(-8, 8),
  main = "BR ending → log(1+ ≤moderate units) in qualifying 5+ projects (lag +4Q)",
  xlab = "Quarters relative to (BR-off +4Q), 0 = first lagged post quarter",
  ylab = "ATT relative to quarter -1"
)

permits_lemod_5p_q %>%
  group_by(juris_clean) %>%
  summarise(
    min_q = min(q_start),
    max_q = max(q_start),
    n_q   = n(),
    n_q_expected = as.integer((max_q - min_q) / ddays(91)) + 1,  # rough; see exact below
    n_distinct_q = n_distinct(q_start),
    any_dupes = n_q != n_distinct_q
  ) %>%
  summarise(
    share_with_dupes = mean(any_dupes),
    min_coverage = min(n_distinct_q),
    max_coverage = max(n_distinct_q)
  )

## # A tibble: 1 × 3
##   share_with_dupes min_coverage max_coverage
##              <dbl>        <int>        <int>
## 1                0            1           15

permits_lemod_5p_q %>%
  group_by(juris_clean) %>%
  summarise(
    min_q = min(q_start),
    max_q = max(q_start),
    n_q = n_distinct(q_start),
    n_expected = length(seq(min_q, max_q, by = "quarter")),
    balanced_within = (n_q == n_expected)
  ) %>%
  count(balanced_within)

## # A tibble: 1 × 2
##   balanced_within     n
##   <lgl>           <int>
## 1 TRUE              144

q_global <- seq(min(permits_lemod_5p_q$q_start),
                max(permits_lemod_5p_q$q_start),
                by = "quarter")

permits_lemod_5p_q %>%
  group_by(juris_clean) %>%
  summarise(n_q = n_distinct(q_start)) %>%
  summarise(
    min_nq = min(n_q),
    max_nq = max(n_q),
    all_same = (min_nq == max_nq),
    global_expected = length(q_global)
  )

## # A tibble: 1 × 4
##   min_nq max_nq all_same global_expected
##    <int>  <int> <lgl>              <int>
## 1      1     15 FALSE                 15

q_global <- seq(min(permits_lemod_5p_q$q_start),
                max(permits_lemod_5p_q$q_start),
                by = "quarter")

permits_lemod_5p_q_rect <- permits_lemod_5p_q %>%
  group_by(juris_clean, county_clean, region) %>%
  complete(q_start = q_global,
           fill = list(permits_units_in_qualproj = 0,
                       le_mod_units_in_qualproj = 0)) %>%
  ungroup()

br_28_jan2026

Olivia Martin

2026-01-28

OFF: Quarterly Balanced Panel

Log Units + Log Projects

Avg Units / Project

% Projects bigger than pre-period 90th percentile

ON did

OFF but with lagged building permits