2. Sample construction

# drop children
data <- data[AGE >= 18]

# Gen by birth year
data[, BIRTHYEAR := YEAR - AGE]
data[, GENERATION := fcase(
  BIRTHYEAR >= 1928 & BIRTHYEAR <= 1945, "Silent",
  BIRTHYEAR >= 1946 & BIRTHYEAR <= 1964, "Boomer",
  BIRTHYEAR >= 1965 & BIRTHYEAR <= 1980, "GenX",
  BIRTHYEAR >= 1981 & BIRTHYEAR <= 1996, "Millennial",
  BIRTHYEAR >= 1997 & BIRTHYEAR <= 2012, "GenZ",
  default = "Other"
)]
data <- data[GENERATION != "Other"]

# Dropping 1970 & silent gen from the sample
# Reasoning:
#   (1) MORTAMT1 is not collected in the 1970 IPUMS sample (per usa_00003.cbk),
#       so owner housing-cost burden cannot be computed for 1970. This silently
#       dropped 1970 owners from the FAI sample, leaving 1970 represented almost
#       entirely by renters, which produced a degenerate reference cell in the
#       earlier model fit (intercept ~ -16 on the logit scale).
#   (2) Silent generation (born 1928-1945) appears in the 25-35 window almost
#       exclusively in 1970 in this extract. Dropping 1970 effectively drops
#       Silent regardless.
# Boomer becomes the natural reference cohort: cleanly observed from 1980 onward
# across multiple sample years with full mortgage data.
data <- data[YEAR != 1970]
data <- data[GENERATION != "Silent"]

data[, GENERATION := factor(GENERATION,
       levels = c("Boomer", "GenX", "Millennial", "GenZ"))]

# prime household-formation window
dt_prime <- data[AGE >= 25 & AGE <= 35]
n_before_cpi <- nrow(dt_prime)
n_before_cpi

## [1] 3367742

3. Inflation adjustment

cpi_lookup <- data.table(
  YEAR = c(1980, 1990, 2000, 2010, 2015, 2019, 2022, 2024),
  cpi  = c(82.4, 130.7, 172.2, 218.1, 237.0, 255.7, 292.7, 313.5)
)

stopifnot(length(setdiff(unique(dt_prime$YEAR), cpi_lookup$YEAR)) == 0)

# Recode IPUMS missing sentinels
dt_prime[VALUEH   == 9999999, VALUEH   := NA]
dt_prime[HHINCOME == 9999999, HHINCOME := NA]
dt_prime[INCTOT   == 9999999, INCTOT   := NA]
dt_prime[MORTAMT1 == 99999,   MORTAMT1 := NA]

dt_prime[COSTELEC %in% c(99993, 99994, 99995, 99999), COSTELEC := NA]
dt_prime[COSTGAS  %in% c(99993, 99994, 99995, 99999), COSTGAS  := NA]
dt_prime[COSTWATR %in% c(99993, 99994, 99995, 99999), COSTWATR := NA]
dt_prime[COSTFUEL %in% c(99993, 99994, 99995, 99999), COSTFUEL := NA]

# merge cpi on year
dt_prime <- cpi_lookup[dt_prime, on = "YEAR"]
stopifnot(nrow(dt_prime) == n_before_cpi)

# inflation-adjust to 2024
cpi_2024 <- 313.5
dt_prime[, valueh_real   := VALUEH   * (cpi_2024 / cpi)]
dt_prime[, hhinc_real    := HHINCOME * (cpi_2024 / cpi)]
dt_prime[, inc_real      := INCTOT   * (cpi_2024 / cpi)]
dt_prime[, rentgrs_real  := RENTGRS  * (cpi_2024 / cpi)]
dt_prime[, mortamt1_real := MORTAMT1 * (cpi_2024 / cpi)]
dt_prime[, costelec_real := COSTELEC * (cpi_2024 / cpi)]
dt_prime[, costgas_real  := COSTGAS  * (cpi_2024 / cpi)]
dt_prime[, costwatr_real := COSTWATR * (cpi_2024 / cpi)]
dt_prime[, costfuel_real := COSTFUEL * (cpi_2024 / cpi)]

# Restrict to own/rent universe (single application; was duplicated in original)
dt_prime <- dt_prime[OWNERSHP %in% c(1, 2)]

4. FAI components

4a. Housing cost burden ratio (universal — defined for owners and renters)

# NOTE: in IPUMS, MORTAMT1 is the MONTHLY mortgage payment, but
# COSTELEC/COSTGAS/COSTWATR/COSTFUEL are ANNUAL dollar amounts.
# RENTGRS already includes utilities and is reported monthly
dt_prime[, burden_ratio := fifelse(
  OWNERSHP == 2,  # renter
  rentgrs_real / (hhinc_real / 12),
  # owner:
  (mortamt1_real +
     (costelec_real + costgas_real + costwatr_real + costfuel_real) / 12) /
    (hhinc_real / 12)
)]

# trm infinities and extreme values
dt_prime[is.infinite(burden_ratio) | burden_ratio > 10 | burden_ratio < 0,
         burden_ratio := NA_real_]

4b. Price-to-income ratio (with year-level imputation for renters)

# owner P/I: standard
dt_prime[, price_to_income_hh := valueh_real / hhinc_real]

year_median_value <- dt_prime[OWNERSHP == 1 & !is.na(valueh_real),
                              .(median_valueh_real = median(valueh_real,
                                                            na.rm = TRUE)),
                              by = YEAR]
dt_prime <- year_median_value[dt_prime, on = "YEAR"]

# Renter P/I = year-median owner home value / household income
dt_prime[OWNERSHP == 2,
         price_to_income_hh := median_valueh_real / hhinc_real]

dt_prime[is.infinite(price_to_income_hh) | price_to_income_hh < 0,
         price_to_income_hh := NA_real_]
dt_prime[price_to_income_hh > 50, price_to_income_hh := NA_real_]

dt_prime[, .(
  n          = .N,
  burden_med = median(burden_ratio,       na.rm = TRUE),
  p2i_med    = median(price_to_income_hh, na.rm = TRUE)
), by = OWNERSHP]

##    OWNERSHP       n burden_med  p2i_med
##       <int>   <int>      <num>    <num>
## 1:        1 1883587  0.1916667 2.169197
## 2:        2 1365678  0.2263830 3.498834

5. FAI via PCA

fai_input <- dt_prime[!is.na(burden_ratio) & !is.na(price_to_income_hh),
                      .(burden_ratio, price_to_income_hh)]

pca_fit <- prcomp(fai_input, center = TRUE, scale. = TRUE)

print(pca_fit$rotation)

##                           PC1        PC2
## burden_ratio       -0.7071068  0.7071068
## price_to_income_hh -0.7071068 -0.7071068

print(summary(pca_fit))

## Importance of components:
##                           PC1    PC2
## Standard deviation     1.2588 0.6445
## Proportion of Variance 0.7923 0.2077
## Cumulative Proportion  0.7923 1.0000

dt_prime[!is.na(burden_ratio) & !is.na(price_to_income_hh),
         FAI := pca_fit$x[, 1]]

sign_check <- cor(dt_prime$FAI, dt_prime$price_to_income_hh, use = "complete.obs")
if (sign_check < 0) {
  dt_prime[, FAI := -FAI]
}

# distribution of FAI by generation
dt_prime[!is.na(FAI), .(
  n          = .N,
  fai_mean   = mean(FAI),
  fai_median = median(FAI)
), by = GENERATION]

##    GENERATION      n    fai_mean fai_median
##        <fctr>  <int>       <num>      <num>
## 1:     Boomer 324426 -0.13227404 -0.4698059
## 2:       GenX 304186 -0.05050935 -0.4257556
## 3: Millennial 702982  0.07155292 -0.3052493
## 4:       GenZ  67257  0.11860417 -0.2399871

6. Model preparation

# IMPORTANT: filter on `price_to_income_hh`, NOT on the PCA-based `FAI`.
model_dt <- dt_prime[!is.na(price_to_income_hh) & !is.na(MARST) &
                     !is.na(RACE) & !is.na(HISPAN) & !is.na(EDUC)]

# outcome: 1 = own, 0 = rent
model_dt[, own := as.integer(OWNERSHP == 1)]

# Collapse RACE to 4 categories (matching Census/JCHS convention)
# IPUMS RACE codes: 1=White, 2=Black, 3=AI/AN, 4=Chinese, 5=Japanese,
# 6=Other Asian/PI, 7=Other, 8=2 races, 9=3+ races
model_dt[, RACE_cat := fcase(
  RACE == 1,            "White",
  RACE == 2,            "Black",
  RACE %in% c(4, 5, 6), "Asian/PI",
  default               = "Other"
)]
model_dt[, RACE_cat := factor(RACE_cat,
       levels = c("White", "Black", "Asian/PI", "Other"))]

# Collapse EDUC to 4 categories (uses the coarse 0-11 EDUC variable)
model_dt[, EDUC_cat := fcase(
  EDUC <= 5,            "<HS",
  EDUC == 6,            "HS",
  EDUC %in% c(7, 8, 9), "Some college",
  EDUC >= 10,           "BA+",
  default               = NA_character_
)]
model_dt[, EDUC_cat := factor(EDUC_cat,
       levels = c("<HS", "HS", "Some college", "BA+"))]

# MARST and HISPAN as factors
model_dt[, MARST  := factor(MARST)]
model_dt[, HISPAN := factor(HISPAN)]

model_dt <- model_dt[!is.na(EDUC_cat)]

7. Headline logit: OWNERSHP ~ FAI × GENERATION + controls

The model uses person-weights (PERWT) for point estimation and year-clustered robust standard errors (via sandwich::vcovCL) for inference.

# Normalize PERWT to have mean 1.
model_dt[, w := PERWT / mean(PERWT)]

# Note: m1 uses the PCA-based FAI which requires both burden_ratio and
# price_to_income_hh to be non-NA.
m1_dt <- model_dt[!is.na(FAI)]

m1 <- glm(
  own ~ FAI * GENERATION + MARST + RACE_cat + HISPAN + EDUC_cat,
  data    = m1_dt,
  weights = w,
  family  = binomial(link = "logit")
)

vcov_cluster <- vcovCL(m1, cluster = ~YEAR, type = "HC1")
coef_table   <- coeftest(m1, vcov. = vcov_cluster)
print(coef_table)

## 
## z test of coefficients:
## 
##                            Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)              -2.8283604  0.6982524  -4.0506 5.108e-05 ***
## FAI                       0.1208920  0.0043321  27.9060 < 2.2e-16 ***
## GENERATIONGenX            0.0555485  0.6743806   0.0824 0.9343527    
## GENERATIONMillennial     -0.8699802  0.6801624  -1.2791 0.2008699    
## GENERATIONGenZ           -1.2657461  0.6693103  -1.8911 0.0586083 .  
## MARST2                   -0.1359686  0.1452338  -0.9362 0.3491675    
## MARST3                   -0.9564324  0.0979400  -9.7655 < 2.2e-16 ***
## MARST4                   -0.8481850  0.0933000  -9.0909 < 2.2e-16 ***
## MARST5                   -0.3544176  0.0940607  -3.7680 0.0001646 ***
## MARST6                   -0.4528536  0.0795644  -5.6917 1.258e-08 ***
## RACE_catBlack            -0.6929560  0.0987096  -7.0201 2.216e-12 ***
## RACE_catAsian/PI         -0.4706524  0.0478319  -9.8397 < 2.2e-16 ***
## RACE_catOther            -0.0358675  0.0605424  -0.5924 0.5535587    
## HISPAN1                  -0.8895947  0.0824093 -10.7948 < 2.2e-16 ***
## HISPAN2                  -0.5884271  0.0947675  -6.2092 5.327e-10 ***
## HISPAN3                  -1.1087214  0.1494737  -7.4175 1.193e-13 ***
## HISPAN4                  -0.6116646  0.0628639  -9.7300 < 2.2e-16 ***
## EDUC_catHS                0.6503610  0.0542101  11.9970 < 2.2e-16 ***
## EDUC_catSome college      0.7465290  0.1013269   7.3675 1.738e-13 ***
## EDUC_catBA+               0.7622140  0.1170553   6.5116 7.437e-11 ***
## FAI:GENERATIONGenX        0.0229925  0.0271469   0.8470 0.3970128    
## FAI:GENERATIONMillennial -0.0952060  0.0495867  -1.9200 0.0548588 .  
## FAI:GENERATIONGenZ       -0.3536628  0.0813378  -4.3481 1.373e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Pseudo-R^2 (McFadden) and N

ll_full <- logLik(m1)
m0      <- glm(own ~ 1, data = m1_dt, weights = w,
               family = binomial(link = "logit"))
ll_null <- logLik(m0)
mcfadden_r2 <- as.numeric(1 - ll_full / ll_null)

cat("N (observations) :", nrow(m1_dt), "\n")

## N (observations) : 1398851

cat("McFadden's R^2   :", round(mcfadden_r2, 4), "\n")

## McFadden's R^2   : 0.056

7b. Robustness: P/I-only FAI (load-bearing spec)

The headline m1 uses an FAI built from both burden_ratio and price_to_income_hh. Because burden_ratio is computed with different formulas for owners and renters, it is mechanically related to tenure status, which is why m1 produced a McFadden R² of 0.75 This chunk fits an alternative model m1_pi using a P/I-only FAI, where P/I is identically defined for owners (own home) and renters (year-median home value) and is therefore not mechanically tied to tenure.

# P/I-only FAI: standardized price_to_income_hh (no PCA needed for one variable;
model_dt[, FAI_pi := scale(price_to_income_hh)[, 1]]

m1_pi <- glm(
  own ~ FAI_pi * GENERATION + MARST + RACE_cat + HISPAN + EDUC_cat,
  data    = model_dt[!is.na(FAI_pi)],
  weights = w,
  family  = binomial(link = "logit")
)

vcov_cluster_pi <- vcovCL(m1_pi, cluster = ~YEAR, type = "HC1")
coef_table_pi   <- coeftest(m1_pi, vcov. = vcov_cluster_pi)
print(coef_table_pi)

## 
## z test of coefficients:
## 
##                              Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)                  0.107520   0.071291   1.5082 0.1315102    
## FAI_pi                      -0.895731   0.163472  -5.4794 4.268e-08 ***
## GENERATIONGenX               0.137791   0.109311   1.2605 0.2074717    
## GENERATIONMillennial         0.208291   0.104417   1.9948 0.0460639 *  
## GENERATIONGenZ               0.165904   0.075999   2.1830 0.0290386 *  
## MARST2                      -0.295277   0.076904  -3.8395 0.0001233 ***
## MARST3                      -0.733821   0.056679 -12.9469 < 2.2e-16 ***
## MARST4                      -0.652322   0.071613  -9.1089 < 2.2e-16 ***
## MARST5                      -0.114145   0.041455  -2.7535 0.0058961 ** 
## MARST6                      -0.622398   0.055160 -11.2835 < 2.2e-16 ***
## RACE_catBlack               -0.640652   0.013057 -49.0640 < 2.2e-16 ***
## RACE_catAsian/PI            -0.408098   0.031961 -12.7686 < 2.2e-16 ***
## RACE_catOther               -0.150248   0.040425  -3.7167 0.0002018 ***
## HISPAN1                     -0.334442   0.045407  -7.3654 1.766e-13 ***
## HISPAN2                     -0.646181   0.087119  -7.4172 1.196e-13 ***
## HISPAN3                     -0.222617   0.053045  -4.1967 2.708e-05 ***
## HISPAN4                     -0.641774   0.027173 -23.6183 < 2.2e-16 ***
## EDUC_catHS                   0.449925   0.016369  27.4863 < 2.2e-16 ***
## EDUC_catSome college         0.466115   0.025605  18.2039 < 2.2e-16 ***
## EDUC_catBA+                  0.386295   0.037756  10.2313 < 2.2e-16 ***
## FAI_pi:GENERATIONGenX        0.143127   0.193051   0.7414 0.4584540    
## FAI_pi:GENERATIONMillennial  0.367830   0.173869   2.1156 0.0343822 *  
## FAI_pi:GENERATIONGenZ        0.302454   0.163888   1.8455 0.0649665 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ll_full_pi <- logLik(m1_pi)
m0_pi      <- glm(own ~ 1, data = model_dt[!is.na(FAI_pi)], weights = w,
                  family = binomial(link = "logit"))
ll_null_pi <- logLik(m0_pi)
mcfadden_r2_pi <- as.numeric(1 - ll_full_pi / ll_null_pi)

cat("N (P/I-only model)        :", nrow(model_dt[!is.na(FAI_pi)]), "\n")

## N (P/I-only model)        : 3150831

cat("McFadden's R^2 (P/I-only) :", round(mcfadden_r2_pi, 4), "\n")

## McFadden's R^2 (P/I-only) : 0.0843

cat("McFadden's R^2 (full FAI) :", round(mcfadden_r2,    4), "\n")

## McFadden's R^2 (full FAI) : 0.056

# cohort slope comparison
fai_main   <- coef(m1)["FAI"]
fai_main_pi <- coef(m1_pi)["FAI_pi"]

cat("FAI slope by cohort (full FAI / P/I-only FAI):\n")

## FAI slope by cohort (full FAI / P/I-only FAI):

cat(sprintf("  Boomer     : %6.2f  /  %6.2f\n",
            fai_main,
            fai_main_pi))

##   Boomer     :   0.12  /   -0.90

for (g in c("GenX", "Millennial", "GenZ")) {
  i_full <- coef(m1)[paste0("FAI:GENERATION", g)]
  i_pi   <- coef(m1_pi)[paste0("FAI_pi:GENERATION", g)]
  cat(sprintf("  %-10s : %6.2f  /  %6.2f\n",
              g,
              fai_main + i_full,
              fai_main_pi + i_pi))
}

##   GenX       :   0.14  /   -0.75
##   Millennial :   0.03  /   -0.53
##   GenZ       :  -0.23  /   -0.59

7c. Parallel outcome: cost burden

This model swaps the outcome from own to cost_burdened (= 1 if a household pays more than 30% of household income on housing, the JCHS threshold), keeping the same right-hand side. The question is different: not “who owns?” but “who is squeezed?”. Because burden_ratio is now part of the outcome rather than the predictor, the mechanical relationship that inflated m1’s R² is no longer an issue!!

# binary cost-burdened outcome
model_dt[, cost_burdened := as.integer(burden_ratio > 0.30)]

cat("N with cost_burdened defined:", sum(!is.na(model_dt$cost_burdened)), "\n")

## N with cost_burdened defined: 1398851

cat("Share cost-burdened (raw)   :",
    round(mean(model_dt$cost_burdened, na.rm = TRUE), 3), "\n")

## Share cost-burdened (raw)   : 0.32

model_dt[!is.na(cost_burdened),
         .(share_burdened = round(weighted.mean(cost_burdened, w), 3),
           n              = .N),
         by = .(GENERATION, OWNERSHP)][order(GENERATION, OWNERSHP)]

##    GENERATION OWNERSHP share_burdened      n
##        <fctr>    <int>          <num>  <int>
## 1:     Boomer        1          0.227  22521
## 2:     Boomer        2          0.268 301905
## 3:       GenX        1          0.300  23299
## 4:       GenX        2          0.320 280887
## 5: Millennial        1          0.288  25921
## 6: Millennial        2          0.370 677061
## 7:       GenZ        1          0.259   1593
## 8:       GenZ        2          0.371  65664

m2 <- glm(
  cost_burdened ~ FAI_pi * GENERATION + MARST + RACE_cat + HISPAN + EDUC_cat,
  data    = model_dt[!is.na(cost_burdened) & !is.na(FAI_pi)],
  weights = w,
  family  = binomial(link = "logit")
)

vcov_cluster_m2 <- vcovCL(m2, cluster = ~YEAR, type = "HC1")
coef_table_m2   <- coeftest(m2, vcov. = vcov_cluster_m2)
print(coef_table_m2)

## 
## z test of coefficients:
## 
##                              Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)                 -1.370632   0.254062  -5.3949 6.857e-08 ***
## FAI_pi                       1.969094   0.012407 158.7098 < 2.2e-16 ***
## GENERATIONGenX               0.164747   0.256388   0.6426  0.520504    
## GENERATIONMillennial         0.090600   0.238943   0.3792  0.704560    
## GENERATIONGenZ              -0.095727   0.222346  -0.4305  0.666809    
## MARST2                       0.072467   0.036790   1.9697  0.048868 *  
## MARST3                       0.121191   0.053492   2.2656  0.023476 *  
## MARST4                       0.021424   0.055073   0.3890  0.697266    
## MARST5                       0.166442   0.111362   1.4946  0.135017    
## MARST6                      -0.017376   0.022409  -0.7754  0.438100    
## RACE_catBlack                0.026833   0.042047   0.6382  0.523371    
## RACE_catAsian/PI             0.325594   0.034412   9.4617 < 2.2e-16 ***
## RACE_catOther                0.040716   0.023834   1.7083  0.087581 .  
## HISPAN1                      0.325819   0.024828  13.1228 < 2.2e-16 ***
## HISPAN2                      0.293186   0.028812  10.1759 < 2.2e-16 ***
## HISPAN3                      0.540179   0.052363  10.3160 < 2.2e-16 ***
## HISPAN4                      0.555419   0.036645  15.1569 < 2.2e-16 ***
## EDUC_catHS                   0.034019   0.042442   0.8015  0.422819    
## EDUC_catSome college         0.162858   0.053036   3.0707  0.002136 ** 
## EDUC_catBA+                  0.150463   0.063760   2.3598  0.018284 *  
## FAI_pi:GENERATIONGenX        0.242472   0.120175   2.0177  0.043627 *  
## FAI_pi:GENERATIONMillennial -0.027295   0.079259  -0.3444  0.730562    
## FAI_pi:GENERATIONGenZ       -0.125320   0.031990  -3.9175 8.948e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ll_full_m2 <- logLik(m2)
m0_m2      <- glm(cost_burdened ~ 1,
                  data = model_dt[!is.na(cost_burdened) & !is.na(FAI_pi)],
                  weights = w, family = binomial(link = "logit"))
ll_null_m2 <- logLik(m0_m2)
mcfadden_r2_m2 <- as.numeric(1 - ll_full_m2 / ll_null_m2)

cat("N (cost-burden model)        :",
    sum(!is.na(model_dt$cost_burdened) & !is.na(model_dt$FAI_pi)), "\n")

## N (cost-burden model)        : 1398851

cat("McFadden's R^2 (cost-burden) :", round(mcfadden_r2_m2, 4), "\n")

## McFadden's R^2 (cost-burden) : 0.2802

# Implied cohort effects on log-odds of being cost-burdened (at FAI_pi = 0,
# reference covariates) — direct read of the GENERATION main effects.
cat("Cohort effect on log-odds(cost-burdened) vs. Boomer reference, FAI=0:\n")

## Cohort effect on log-odds(cost-burdened) vs. Boomer reference, FAI=0:

for (g in c("GenX", "Millennial", "GenZ")) {
  est <- coef(m2)[paste0("GENERATION", g)]
  cat(sprintf("  %-10s : %+.3f\n", g, est))
}

##   GenX       : +0.165
##   Millennial : +0.091
##   GenZ       : -0.096

8. Predicted probabilities by generation across FAI range

fai_grid <- seq(quantile(model_dt$FAI_pi, 0.05, na.rm = TRUE),
                quantile(model_dt$FAI_pi, 0.95, na.rm = TRUE),
                length.out = 50)

modal <- function(x) names(sort(table(x), decreasing = TRUE))[1]
pred_grid <- expand.grid(
  FAI_pi     = fai_grid,
  GENERATION = levels(model_dt$GENERATION),
  MARST      = modal(model_dt$MARST),
  RACE_cat   = modal(model_dt$RACE_cat),
  HISPAN     = modal(model_dt$HISPAN),
  EDUC_cat   = modal(model_dt$EDUC_cat)
)
pred_grid$GENERATION <- factor(pred_grid$GENERATION,
                               levels = levels(model_dt$GENERATION))
pred_grid$MARST    <- factor(pred_grid$MARST,    levels = levels(model_dt$MARST))
pred_grid$RACE_cat <- factor(pred_grid$RACE_cat, levels = levels(model_dt$RACE_cat))
pred_grid$HISPAN   <- factor(pred_grid$HISPAN,   levels = levels(model_dt$HISPAN))
pred_grid$EDUC_cat <- factor(pred_grid$EDUC_cat, levels = levels(model_dt$EDUC_cat))

pred_grid$pred_own <- predict(m1_pi, newdata = pred_grid, type = "response")

ggplot(pred_grid, aes(x = FAI_pi, y = pred_own, color = GENERATION)) +
  geom_line(size = 1) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  labs(
    title    = "Predicted homeownership probability by affordability and generation",
    subtitle = "Holding marital status, race, ethnicity, and education at modal values",
    x        = "Affordability index (z-scored price-to-income; higher = less attainable)",
    y        = "Predicted Pr(own home)",
    color    = "Generation"
  ) +
  theme_minimal()

8. save cleaned data and fitted models

out_dir <- "~/Desktop/DA6833/FinancialNihilism"

saveRDS(model_dt, file.path(out_dir, "model_dt.rds"))
saveRDS(m1_pi,    file.path(out_dir, "m1_pi.rds"))
saveRDS(m2,       file.path(out_dir, "m2.rds"))

cat("Saved:\n")

## Saved:

cat("  model_dt.rds (rows = ", nrow(model_dt), ")\n", sep = "")

##   model_dt.rds (rows = 3150831)

cat("  m1_pi.rds    (homeownership model, P/I-only FAI)\n")

##   m1_pi.rds    (homeownership model, P/I-only FAI)

cat("  m2.rds       (cost-burden model)\n")

##   m2.rds       (cost-burden model)

Structural Drivers of Financial Nihilism — Initial Analysis

Cleaning → FAI construction → cohort-by-FAI logit

Olivia Shipley

2026-03-25

0. Setup

1. Load data