Final Project

Author

Khandker Qaiduzzaman

Project Proposal

This project aims to analyze the housing affordability crisis in New York State by integrating housing market data, demographic statistics, and macroeconomic indicators to understand the relationship between income levels, housing costs, and broader economic conditions. The motivation for performing this analysis is to better understand the widening gap between household incomes and housing prices/rents in New York, and to use data-driven methods to identify how affordability varies across regions and over time. This topic is important because housing costs directly impact financial stability, migration patterns, and quality of life, and combining multiple data sources provides a more complete picture than any single dataset alone. Data will be sourced from Zillow housing data (for rent and home price estimates), the U.S. Census American Community Survey 5-Year Estimates accessed via the Census API (for income, rent burden, and housing value indicators at county and state levels), and the Federal Reserve Economic Data (FRED) API (for macroeconomic variables such as interest rates, inflation, and mortgage rates). The analysis will follow a structured data science workflow involving data acquisition through APIs, data cleaning and transformation to align geographic and temporal scales, exploratory data analysis, and statistical modeling to examine affordability ratios and trends. Visualizations such as income-to-housing-cost ratios, geographic affordability comparisons, and macroeconomic trend overlays will be used to support findings, demonstrating how multi-source data integration can provide insight into housing affordability pressures in New York State.

Running Code

#install.packages("fredr")

library(tidyverse)
library(httr)
library(jsonlite)
library(tidycensus)
library(fredr)
library(lubridate)
library(zoo)
library(gt)

############################################################
# ACS 5-YEAR - MULTI-YEAR PULL (NY COUNTIES)
# CLEAN COUNTY NAMES FOR JOINING
############################################################

library(tidyverse)
library(jsonlite)

years <- 2009:2024

acs_5yr_all <- list()

for (yr in years) {
  
  url <- paste0(
    "https://api.census.gov/data/", yr, "/acs/acs5?",
    "get=NAME,B19013_001E,B25077_001E,B25064_001E&",
    "for=county:*&in=state:36"
  )
  
  try({
    
    raw <- fromJSON(url)
    
    df <- as.data.frame(raw[-1, ], stringsAsFactors = FALSE)
    colnames(df) <- raw[1, ]
    
    df <- df %>%
      mutate(year = yr) %>%
      rename(
        county_name   = NAME,
        median_income = B19013_001E,
        median_home   = B25077_001E,
        median_rent   = B25064_001E,
        state_fips    = state,
        county_fips   = county
      ) %>%
      
      # CLEAN COUNTY NAME
      mutate(
        county_name = county_name %>%
          str_remove(", New York") %>%
          str_trim()
      ) %>%
      
      # NUMERIC CONVERSION
      mutate(
        median_income = as.numeric(median_income),
        median_home   = as.numeric(median_home),
        median_rent   = as.numeric(median_rent)
      )
    
    acs_5yr_all[[as.character(yr)]] <- df
    
  }, silent = TRUE)
}

############################################################
# COMBINE ALL YEARS
############################################################

acs_ny_5yr <- bind_rows(acs_5yr_all)

############################################################
# EXPORT CSV
############################################################

write_csv(acs_ny_5yr, "acs_ny_5year_2005_2024.csv")

############################################################
# CHECK
############################################################

acs_ny_5yr |> head(5) |> gt()

county_name	median_income	median_home	median_rent	state_fips	county_fips	year
Albany County	55350	192500	830	36	001	2009
Allegany County	40917	65100	566	36	003	2009
Bronx County	33794	369600	885	36	005	2009
Broome County	43467	95600	598	36	007	2009
Cattaraugus County	41482	76000	576	36	009	2009

############################################################
# ACS 5-YEAR EDA — STATEWIDE TRENDS + AFFORDABILITY RATIO
############################################################
library(tidyverse)
library(scales)
library(patchwork)

# ── PREP ─────────────────────────────────────────────────
acs_clean <- acs_ny_5yr |>
  filter(median_income > 0, median_home > 0, median_rent > 0)

state_trends <- acs_clean |>
  group_by(year) |>
  summarise(
    `Median Income`       = mean(median_income, na.rm = TRUE),
    `Median Home Value`   = mean(median_home,   na.rm = TRUE),
    `Median Monthly Rent` = mean(median_rent,   na.rm = TRUE),
    .groups = "drop"
  ) |>
  pivot_longer(-year, names_to = "Metric", values_to = "Value")

# ── PLOT 1: TRENDS ───────────────────────────────────────
p1 <- ggplot(state_trends, aes(x = year, y = Value, color = Metric)) +
  geom_line(linewidth = 1.3) +
  geom_point(size = 2) +
  facet_wrap(~Metric, scales = "free_y", ncol = 1) +
  scale_color_manual(values = c(
    "Median Income"       = "#1B9E77",
    "Median Home Value"   = "#D95F02",
    "Median Monthly Rent" = "#7570B3"
  )) +
  scale_x_continuous(breaks = seq(2009, 2024, 2)) +
  scale_y_continuous(labels = label_dollar(scale_cut = cut_short_scale())) +
  labs(
    title    = "NY Housing and Income Trends",
    subtitle = "State averages across all NY counties",
    x = "Year", y = "USD"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position  = "none",
    strip.text       = element_text(face = "bold", size = 11),
    plot.title       = element_text(face = "bold", size = 14),
    plot.subtitle    = element_text(size = 11, color = "gray40"),
    axis.title       = element_text(face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# ── PLOT 2: PRICE-TO-INCOME RATIO ────────────────────────
pti_summary <- acs_clean |>
  mutate(price_to_income = median_home / median_income) |>
  group_by(year) |>
  summarise(
    pti_mean = mean(price_to_income, na.rm = TRUE),
    pti_lo   = quantile(price_to_income, 0.25, na.rm = TRUE),
    pti_hi   = quantile(price_to_income, 0.75, na.rm = TRUE),
    .groups  = "drop"
  )

p2 <- ggplot(pti_summary, aes(x = year, y = pti_mean)) +
  geom_ribbon(aes(ymin = pti_lo, ymax = pti_hi), fill = "#D95F02", alpha = 0.15) +
  geom_line(color = "#D95F02", linewidth = 1.3) +
  geom_point(color = "#D95F02", size = 2) +
  geom_hline(yintercept = 3, linetype = "dashed", color = "gray50") +
  annotate("text", x = 2009, y = 3.15, label = "3× affordability threshold",
           hjust = 0, size = 3.5, color = "gray40") +
  scale_x_continuous(breaks = seq(2009, 2024, 2)) +
  scale_y_continuous(labels = label_number(suffix = "×", accuracy = 0.1)) +
  labs(
    title    = "Home Price-to-Income Ratio (2009–2024)",
    subtitle = "Mean across NY counties; shaded band = interquartile range",
    x = "Year", y = "Home Value ÷ Median Income"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title       = element_text(face = "bold", size = 14),
    plot.subtitle    = element_text(size = 11, color = "gray40"),
    axis.title       = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )

# ── COMBINE ──────────────────────────────────────────────
p1 + p2 +
  plot_layout(widths = c(1, 1)) +
  plot_annotation(
    caption = "Note: ACS 5-Year data averages survey responses over 5 years, producing smoother trends that reduce year-to-year volatility but may lag real-time market changes.",
    theme = theme(
      plot.caption = element_text(size = 10, color = "gray40", hjust = 0, face = "italic")
    )
  )

############################################################
# ROBUST FRED PULL (FIXES RANDOM 500 SERVER ERROR)
############################################################

library(tidyverse)
library(httr)
library(jsonlite)
library(lubridate)
library(zoo)

############################################################
# SAFE FUNCTION WITH RETRY LOGIC
############################################################

get_fred_data <- function(series_id, retries = 5, wait_sec = 10) {
  
  api_key <- Sys.getenv("FRED_API_KEY")
  
  url <- paste0(
    "https://api.stlouisfed.org/fred/series/observations?",
    "series_id=", series_id,
    "&api_key=", api_key,
    "&file_type=json"
  )
  
  for(i in 1:retries){
    
    cat("Trying", series_id, "- Attempt", i, "\n")
    
    result <- try(GET(url), silent = TRUE)
    
    if(!inherits(result, "try-error") && status_code(result) == 200){
      
      txt <- content(result, "text", encoding = "UTF-8")
      data_raw <- fromJSON(txt)
      
      df <- data_raw$observations %>%
        select(date, value) %>%
        mutate(
          date = as.Date(date),
          value = as.numeric(value)
        )
      
      return(df)
    }
    
    Sys.sleep(wait_sec)
  }
  
  stop(paste("Failed after retries:", series_id))
}

############################################################
# PULL DATA
############################################################

mortgage30 <- get_fred_data("MORTGAGE30US")

Trying MORTGAGE30US - Attempt 1

Sys.sleep(5)

fed_funds <- get_fred_data("FEDFUNDS")

Trying FEDFUNDS - Attempt 1

Sys.sleep(5)

cpi <- get_fred_data("CPIAUCSL")

Trying CPIAUCSL - Attempt 1

############################################################
# MONTHLY CLEAN DATASET
############################################################

mortgage_m <- mortgage30 %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(mortgage_rate = mean(value, na.rm = TRUE), .groups = "drop")

fed_m <- fed_funds %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(fed_rate = mean(value, na.rm = TRUE), .groups = "drop")

cpi_m <- cpi %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(cpi = mean(value, na.rm = TRUE), .groups = "drop")

fred_full_monthly <- mortgage_m %>%
  inner_join(fed_m, by = "month") %>%
  inner_join(cpi_m, by = "month") %>%
  arrange(month) %>%
  mutate(
    mortgage_rate = na.approx(mortgage_rate, na.rm = FALSE),
    fed_rate      = na.approx(fed_rate, na.rm = FALSE),
    cpi           = na.approx(cpi, na.rm = FALSE)
  )

############################################################
# EXPORT
############################################################

write_csv(fred_full_monthly, "fred_full_monthly_1971_present.csv")

fred_full_monthly |> head(5) |> gt()

month	mortgage_rate	fed_rate	cpi
1971-04-01	7.3100	4.16	40.1
1971-05-01	7.4250	4.63	40.3
1971-06-01	7.5300	4.91	40.5
1971-07-01	7.6040	5.31	40.6
1971-08-01	7.6975	5.57	40.7

############################################################
# FRED EDA — INTEREST RATES + CPI (YoY INFLATION)
############################################################
library(tidyverse)
library(scales)
library(patchwork)

# Okabe-Ito colorblind-friendly palette
COL_MORTGAGE <- "#0072B2"   # blue
COL_FED      <- "#D55E00"   # vermillion
COL_CPI_POS  <- "#D55E00"   # vermillion (inflation up)
COL_CPI_NEG  <- "#0072B2"   # blue (deflation)

# ── PLOT 1: MORTGAGE RATE vs FED FUNDS RATE ──────────────
p1 <- fred_full_monthly |>
  select(month,
         `30-Yr Mortgage Rate` = mortgage_rate,
         `Fed Funds Rate`      = fed_rate) |>
  pivot_longer(-month, names_to = "Series", values_to = "Rate") |>
  ggplot(aes(x = month, y = Rate, color = Series)) +
  geom_line(linewidth = 0.9) +
  scale_color_manual(values = c(
    "30-Yr Mortgage Rate" = COL_MORTGAGE,
    "Fed Funds Rate"      = COL_FED
  )) +
  scale_x_date(date_breaks = "5 years", date_labels = "%Y") +
  scale_y_continuous(labels = label_percent(scale = 1, accuracy = 1)) +
  labs(
    title    = "Interest Rates Over Time",
    subtitle = "30-year fixed mortgage rate closely tracks Fed policy cycles",
    x = NULL, y = "Rate (%)", color = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position  = "bottom",
    plot.title       = element_text(face = "bold", size = 14),
    plot.subtitle    = element_text(size = 11, color = "gray40"),
    axis.title       = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )

# ── PLOT 2: CPI YEAR-OVER-YEAR % CHANGE ──────────────────
cpi_yoy <- fred_full_monthly |>
  arrange(month) |>
  mutate(
    cpi_yoy = (cpi / lag(cpi, 12) - 1) * 100,
    positive = cpi_yoy >= 0
  ) |>
  filter(!is.na(cpi_yoy))

p2 <- ggplot(cpi_yoy, aes(x = month, y = cpi_yoy)) +
  geom_hline(yintercept = 0, color = "gray60", linewidth = 0.5) +
  geom_hline(yintercept = 2, linetype = "dashed", color = "gray50", linewidth = 0.5) +
  geom_col(aes(fill = positive), width = 20, show.legend = FALSE) +
  annotate("text", x = min(cpi_yoy$month), y = 2.4,
           label = "2% Fed target", hjust = 0, size = 3.2, color = "gray40") +
  scale_fill_manual(values = c("TRUE" = COL_CPI_POS, "FALSE" = COL_CPI_NEG)) +
  scale_x_date(date_breaks = "5 years", date_labels = "%Y") +
  scale_y_continuous(labels = label_percent(scale = 1, accuracy = 1)) +
  labs(
    title    = "Annual Inflation Rate (CPI Year-over-Year)",
    subtitle = "Monthly % change vs. same month prior year; 2022 spike reflects post-COVID surge",
    x = NULL, y = "YoY Change (%)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title       = element_text(face = "bold", size = 14),
    plot.subtitle    = element_text(size = 11, color = "gray40"),
    axis.title       = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )

# ── COMBINE ───────────────────────────────────────────────
p1 / p2 +
  plot_annotation(
    caption = "Note: Monthly FRED data. Mortgage rate averaged to monthly from weekly Freddie Mac survey. Fed Funds Rate is the effective rate. CPI inflation = YoY % change in CPIAUCSL (seasonally adjusted).",
    theme = theme(
      plot.caption = element_text(size = 10, color = "gray40", hjust = 0, face = "italic")
    )
  )

############################################################
# DATASET 2: ACS-ALIGNED YEARLY FRED (CLEAN VERSION)
############################################################

library(dplyr)
library(lubridate)

start_date <- as.Date("2009-01-01")

fred_acs_ready <- fred_full_monthly %>%
  
  filter(month >= start_date) %>%
  mutate(year = year(month)) %>%
  
  group_by(year) %>%
  summarise(
    
    # macroeconomic level indicators (core variables)
    mortgage_rate = mean(mortgage_rate, na.rm = TRUE),
    fed_rate      = mean(fed_rate, na.rm = TRUE),
    cpi           = mean(cpi, na.rm = TRUE),
    
    .groups = "drop"
  ) %>%
  
  arrange(year)

# View
head(fred_acs_ready, 5)

# A tibble: 5 × 4
   year mortgage_rate fed_rate   cpi
  <dbl>         <dbl>    <dbl> <dbl>
1  2009          5.04    0.16   215.
2  2010          4.69    0.175  218.
3  2011          4.46    0.102  225.
4  2012          3.66    0.14   230.
5  2013          3.98    0.108  233.

# Save
write_csv(fred_acs_ready, "fred_acs_ready_2009_present.csv")

fred_acs_ready |> head(5) |> gt()

year	mortgage_rate	fed_rate	cpi
2009	5.041375	0.1600000	214.5647
2010	4.690583	0.1750000	218.0762
2011	4.455833	0.1016667	224.9230
2012	3.655917	0.1400000	229.5861
2013	3.981917	0.1075000	232.9518

library(tidyverse)

############################################################
# PULL HUD FMR DATA FROM GITHUB (2009–2024)
############################################################

years <- sprintf("%02d", 9:24)   

base_url <- "https://raw.githubusercontent.com/NafeesKhandker/DATA-607-Final-Project/main/HUD%20dataset/"

############################################################
# FUNCTION TO READ EACH FILE
############################################################

get_hud_year <- function(yy){

  file_url <- paste0(base_url, "FY", yy, "_FMRs.csv")
  
  message("Reading: ", file_url)

  df <- read_csv(file_url, show_col_types = FALSE)

  df %>%
    mutate(
      year = as.integer(paste0("20", yy))
    )
}

############################################################
# COMBINE ALL YEARS
############################################################

hud_all <- map_dfr(years, get_hud_year)

############################################################
# CLEAN COLUMN NAMES
############################################################

names(hud_all) <- names(hud_all) %>%
  str_to_lower() %>%
  str_replace_all(" ", "_")

############################################################
# VIEW DATA
############################################################

hud_all |> head(n = 5) |> gt()

stusps	state	countyname	pop	fmr_0	fmr_1	fmr_2	fmr_3	fmr_4	year
NY	36	Albany County	294565	686	711	868	1039	1135	2009
NY	36	Allegany County	49927	553	555	665	829	1018	2009
NY	36	Bronx County	1332650	1091	1180	1313	1615	1817	2009
NY	36	Broome County	200536	580	583	697	910	1067	2009
NY	36	Cattaraugus County	83955	560	562	676	888	1019	2009

############################################################
# HUD FMR — FMR BY COUNTY & BEDROOM SIZE (2024)
############################################################
library(tidyverse)
library(scales)
library(patchwork)

# ── PREP ─────────────────────────────────────────────────
hud_avg <- hud_all |>
  filter(stusps == "NY", year == 2024) |>                   
  mutate(county_label = str_remove(countyname, " County")) |>
  group_by(county_label) |>
  summarise(across(fmr_0:fmr_4, ~ mean(.x, na.rm = TRUE)), .groups = "drop") |>
  pivot_longer(fmr_0:fmr_4, names_to = "bedroom", values_to = "avg_fmr") |>
  mutate(bedroom = recode(bedroom,
    fmr_0 = "Studio", fmr_1 = "1-BR", fmr_2 = "2-BR",
    fmr_3 = "3-BR",   fmr_4 = "4-BR"
  ))

county_order <- hud_avg |>
  filter(bedroom == "2-BR") |>
  arrange(avg_fmr) |>
  pull(county_label)

hud_avg <- hud_avg |>
  mutate(
    county_label = factor(county_label, levels = county_order),
    bedroom      = factor(bedroom, levels = c("Studio","1-BR","2-BR","3-BR","4-BR")),
    panel        = ifelse(
      as.integer(county_label) > length(county_order) / 2,
      "Higher-Rent Counties", "Lower-Rent Counties"
    )
  )

COLOR_VALS <- c(
  "Studio" = "#332288", "1-BR" = "#44AA99", "2-BR" = "#117733",
  "3-BR"   = "#DDCC77", "4-BR" = "#CC6677"
)

# ── SHARED PLOT FUNCTION ──────────────────────────────────
dot_panel <- function(data, show_legend = FALSE) {
  ggplot(data, aes(x = avg_fmr, y = county_label, color = bedroom)) +
    geom_line(aes(group = county_label), color = "gray80", linewidth = 0.5) +
    geom_point(size = 2.2, alpha = 0.9) +
    scale_color_manual(values = COLOR_VALS, name = "Bedroom Size") +
    scale_x_continuous(
      labels = label_dollar(),
      expand = expansion(mult = c(0.02, 0.05))
    ) +
    scale_y_discrete(drop = TRUE) +
    facet_wrap(~ panel, scales = "free_y") +
    labs(x = "Monthly FMR (USD)", y = NULL) +
    theme_minimal(base_size = 11) +
    theme(
      legend.position    = if (show_legend) "top" else "none",
      legend.title       = element_text(face = "bold", size = 9),
      strip.text         = element_text(face = "bold", size = 11),
      axis.text.y        = element_text(size = 8.5),
      axis.title.x       = element_text(face = "bold"),
      axis.ticks.y       = element_blank(),
      panel.grid.major.y = element_line(color = "gray93"),
      panel.grid.major.x = element_line(color = "gray90"),
      panel.grid.minor   = element_blank()
    )
}

# ── BUILD TWO PANELS ──────────────────────────────────────
p_high <- dot_panel(filter(hud_avg, panel == "Higher-Rent Counties"), show_legend = TRUE)
p_low  <- dot_panel(filter(hud_avg, panel == "Lower-Rent Counties"),  show_legend = FALSE)

# ── COMBINE ───────────────────────────────────────────────
p_high + p_low +
  plot_annotation(
    title    = "HUD Fair Market Rents by NY County & Bedroom Size (2024)",          # changed
    subtitle = "Counties ordered by 2-BR FMR (ascending); connecting line shows spread across bedroom sizes",
    caption  = "Note: 2024 HUD Fair Market Rents represent the 40th percentile gross rent for standard units.\nCounties with wider horizontal spans have a larger bedroom-size premium, often reflecting tighter family-sized unit supply.",  # changed
    theme = theme(
      plot.title    = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10, color = "gray40"),
      plot.caption  = element_text(size = 9, color = "gray40", hjust = 0, face = "italic")
    )
  )

ggsave("hud_fmr_dotplot.png", width = 16, height = 10, dpi = 150)

############################################################
# FINAL MASTER DATASET (ACS5 VERSION)
# YEARS RESTRICTED TO 2009–2024
# CLEAN FEATURE SET (LOW MULTICOLLINEARITY)
############################################################

library(tidyverse)
library(lubridate)
library(zoo)

############################################################
# 1. CLEAN HUD DATA
############################################################

hud_clean <- hud_all %>%
  filter(
    stusps == "NY",
    year >= 2009,
    year <= 2024
  ) %>%
  mutate(
    county_name = countyname %>%
      str_remove(", NY") %>%
      str_remove(", New York") %>%
      str_trim()
  ) %>%
  select(
    county_name,
    year,
    pop,
    fmr_0,
    fmr_1,
    fmr_2,
    fmr_3,
    fmr_4
  ) %>%
  distinct()

############################################################
# 2. CLEAN ACS5 DATA
############################################################

acs_clean <- acs_ny_5yr %>%
  filter(
    year >= 2009,
    year <= 2024
  ) %>%
  distinct()

############################################################
# 3. CLEAN FRED DATA
############################################################

fred_clean <- fred_acs_ready %>%
  filter(
    year >= 2009,
    year <= 2024
  ) %>%
  distinct()

############################################################
# 4. BUILD FULL COUNTY-YEAR GRID
############################################################

all_counties <- sort(unique(hud_clean$county_name))
all_years    <- 2009:2024

full_grid <- expand_grid(
  county_name = all_counties,
  year = all_years
)

############################################################
# 5. JOIN ACS5
############################################################

master1 <- full_grid %>%
  left_join(acs_clean, by = c("county_name", "year"))

############################################################
# 6. JOIN HUD
############################################################

master2 <- master1 %>%
  left_join(hud_clean, by = c("county_name", "year"))

############################################################
# 7. JOIN FRED
############################################################

final_master <- master2 %>%
  left_join(fred_clean, by = "year")

############################################################
# 8. INFLATION RATE (FIXED)
############################################################

inflation_tbl <- fred_clean %>%
  arrange(year) %>%
  mutate(
    inflation_rate = (cpi / lag(cpi) - 1) * 100
  ) %>%
  select(year, inflation_rate)

final_master <- final_master %>%
  select(-any_of("inflation_rate")) %>%
  left_join(inflation_tbl, by = "year")

############################################################
# 9. MINIMAL & NON-REDUNDANT DERIVED FEATURES
############################################################

final_master <- final_master %>%
  mutate(
    
    # 1. Housing burden (market-based)
    housing_burden = (median_rent * 12) / median_income,
    
    # 2. HUD affordability burden (policy benchmark)
    hud_burden = (fmr_2 * 12) / median_income,
    
    # 3. Real income (CPI-adjusted purchasing power)
    real_income_index = (median_income / cpi) * 100,
    
    # 4. Housing stress from macro environment
    macro_housing_pressure = housing_burden * mortgage_rate
  )

############################################################
# 10. FINAL SORT
############################################################

final_master <- final_master %>%
  arrange(county_name, year)

############################################################
# 11. EXPORT
############################################################

write_csv(
  final_master,
  "ny_housing_final_master_dataset_acs5_2009_2024.csv"
)

final_master

# A tibble: 992 × 21
   county_name    year median_income median_home median_rent state_fips
   <chr>         <dbl>         <dbl>       <dbl>       <dbl> <chr>     
 1 Albany County  2009         55350      192500         830 36        
 2 Albany County  2010         56090      202500         855 36        
 3 Albany County  2011         57715      207300         880 36        
 4 Albany County  2012         59359      210200         890 36        
 5 Albany County  2013         59394      209300         904 36        
 6 Albany County  2014         59940      208600         918 36        
 7 Albany County  2015         59887      208400         919 36        
 8 Albany County  2016         60904      211100         931 36        
 9 Albany County  2017         62293      214400         969 36        
10 Albany County  2018         64535      218100         993 36        
# ℹ 982 more rows
# ℹ 15 more variables: county_fips <chr>, pop <dbl>, fmr_0 <dbl>, fmr_1 <dbl>,
#   fmr_2 <dbl>, fmr_3 <dbl>, fmr_4 <dbl>, mortgage_rate <dbl>, fed_rate <dbl>,
#   cpi <dbl>, inflation_rate <dbl>, housing_burden <dbl>, hud_burden <dbl>,
#   real_income_index <dbl>, macro_housing_pressure <dbl>

############################################################
# SHINY APP — NY HOUSING AFFORDABILITY
############################################################
library(shiny)
library(leaflet)
library(tigris)
library(sf)
library(tidyverse)

options(tigris_use_cache = TRUE)

ny_sf <- counties(state = "NY", cb = TRUE, year = 2020, progress_bar = FALSE) |>
  st_transform(crs = 4326) |>
  mutate(county_name = paste0(NAME, " County"))

metric_choices <- c(
  "Median Household Income"      = "median_income",
  "Median Home Value"            = "median_home",
  "Median Monthly Rent"          = "median_rent",
  "FMR — Studio (0 BR)"          = "fmr_0",
  "FMR — 1 Bedroom"              = "fmr_1",
  "FMR — 2 Bedroom"              = "fmr_2",
  "FMR — 3 Bedroom"              = "fmr_3",
  "FMR — 4 Bedroom"              = "fmr_4",
  "Housing Burden (Rent/Income)" = "housing_burden",
  "HUD Burden (FMR-2/Income)"    = "hud_burden",
  "Real Income Index"            = "real_income_index"
)

dollar_metrics <- c("median_income","median_home","median_rent",
                    "fmr_0","fmr_1","fmr_2","fmr_3","fmr_4")
ratio_metrics  <- c("housing_burden","hud_burden")

fmt_display <- function(m, val) {
  if (is.na(val) || !is.finite(val)) return("N/A")
  if (m %in% dollar_metrics)
    paste0("$", formatC(val, format = "f", digits = 0, big.mark = ","))
  else if (m %in% ratio_metrics)
    paste0(round(val * 100, 1), "%")
  else
    as.character(round(val, 2))
}

# ── UI ─────────────────────────────────────────────────────
ui <- fluidPage(
  tags$head(tags$style(HTML("
    body, .container-fluid { margin: 0; padding: 0; }

    #controls {
      display: flex;
      align-items: flex-end;
      gap: 32px;                          /* more breathing room between items */
      padding: 14px 24px;                 /* more vertical + horizontal padding */
      background: #f8f9fa;
      border-bottom: 2px solid #dee2e6;
      flex-wrap: wrap;
    }

    #controls h4 {
      margin: 0 16px 6px 0;              /* push title away from dropdowns */
      font-size: 1.05em;
      font-weight: 700;
      align-self: center;
      white-space: nowrap;
      color: #1e2d40;
    }

    .ctrl-group {
      display: flex;
      flex-direction: column;
      gap: 3px;                           /* small gap between label and input */
    }

    .ctrl-group label {
      font-size: 0.78em;
      font-weight: 600;
      color: #555;
      margin: 0;
    }

    .form-group { margin-bottom: 0; }
    .irs--shiny .irs-single { font-size: 11px; }
  "))),

  div(id = "controls",
    tags$h4("NY Housing Affordability Explorer"),
    div(class = "ctrl-group",
      tags$label("Metric"),
      selectInput("metric", label = NULL,
                  choices  = metric_choices,
                  selected = "housing_burden",
                  width    = "230px")
    ),
    div(class = "ctrl-group",
      tags$label("Year"),
      sliderInput("year", label = NULL,
                  min = 2009, max = 2024, value = 2024,
                  step = 1, sep = "", width = "320px",
                  animate = animationOptions(interval = 1000, loop = FALSE))
    )
  ),

  leafletOutput("map", width = "100%", height = "calc(100vh - 85px)")
)

# ── SERVER ─────────────────────────────────────────────────
server <- function(input, output, session) {

  year_data <- reactive({
    final_master |>
      filter(year == input$year) |>
      transmute(
        county_name,
        pop           = as.numeric(pop),
        median_income = as.numeric(median_income),
        median_rent   = as.numeric(median_rent),
        value         = as.numeric(.data[[input$metric]])
      )
  })

  map_df <- reactive({
    ny_sf |> left_join(year_data(), by = "county_name")
  })

  output$map <- renderLeaflet({
    leaflet() |>
      addProviderTiles(providers$CartoDB.Positron) |>
      setView(lng = -75.5, lat = 42.9, zoom = 7)
  })

  observe({
    req(input$metric, input$year)

    df     <- map_df()
    m      <- input$metric
    mlabel <- names(metric_choices)[metric_choices == m]

    finite_vals <- df$value[is.finite(df$value)]
    req(length(finite_vals) > 0)

    pal <- colorNumeric("YlOrRd", domain = finite_vals, na.color = "#cccccc")

    if (m %in% ratio_metrics) {
      leg_vals  <- finite_vals * 100
      leg_pal   <- colorNumeric("YlOrRd", domain = leg_vals, na.color = "#cccccc")
      leg_title <- paste0(mlabel, " (%)<br/><small>", input$year, "</small>")
    } else {
      leg_vals  <- finite_vals
      leg_pal   <- pal
      leg_title <- paste0(mlabel, "<br/><small>", input$year, "</small>")
    }

    # ── TOOLTIPS ─────────────────────────────────────────
    labels <- lapply(seq_len(nrow(df)), function(i) {
      cname <- df$county_name[i]
      pop   <- df$pop[i]
      inc   <- df$median_income[i]
      rent  <- df$median_rent[i]
      val   <- df$value[i]

      pop_fmt  <- if (is.na(pop)  || !is.finite(pop))  "N/A" else
                    formatC(pop, format = "f", digits = 0, big.mark = ",")
      inc_fmt  <- fmt_display("median_income", inc)
      rent_fmt <- fmt_display("median_rent",   rent)
      val_fmt  <- fmt_display(m, val)

      htmltools::HTML(paste0(
        "<div style='font-family:Arial,sans-serif; font-size:12px;",
                     "min-width:180px; line-height:1.5;'>",
          "<div style='font-size:13px; font-weight:700; color:#1e2d40;",
                      "border-bottom:1px solid #ddd; padding-bottom:4px;",
                      "margin-bottom:6px;'>",
            cname,
          "</div>",
          "<div style='color:#555;'>",
            "<b>Population:</b> ",    pop_fmt,  "<br/>",
            "<b>Median Income:</b> ", inc_fmt,  "<br/>",
            "<b>Median Rent:</b> ",   rent_fmt, "<br/>",
          "</div>",
          "<div style='margin-top:6px; padding-top:5px;",
                      "border-top:1px solid #ddd; color:#1e2d40;'>",
            "<b>", mlabel, ":</b> ",
            "<span style='color:#c0392b; font-weight:700;'>", val_fmt, "</span>",
          "</div>",
        "</div>"
      ))
    })

    leafletProxy("map", session) |>
      clearShapes() |>
      clearControls() |>
      addPolygons(
        data         = df,
        fillColor    = ~pal(value),
        fillOpacity  = 0.72,
        color        = "white",
        weight       = 1,
        highlight    = highlightOptions(
          weight       = 2.5,
          color        = "#1e2d40",
          fillOpacity  = 0.9,
          bringToFront = TRUE
        ),
        label        = labels,
        labelOptions = labelOptions(
          style     = list(
            "background-color" = "white",
            "border"           = "1px solid #ccc",
            "border-radius"    = "6px",
            "padding"          = "8px",
            "box-shadow"       = "2px 2px 6px rgba(0,0,0,0.15)"
          ),
          textsize  = "12px",
          direction = "auto",
          opacity   = 1
        ),
        layerId = ~county_name
      ) |>
      addLegend(
        pal      = leg_pal,
        values   = leg_vals,
        title    = leg_title,
        position = "bottomright",
        opacity  = 0.85
      )
  })
}

shinyApp(ui, server)