#1 Load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)     
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(MASS)       
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(lmtest)     
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)       
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
#2 Load data and construct analysis variables
ahs.household.data <- read_csv("household.csv")
## Rows: 55669 Columns: 1180
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (635): CONTROL, JACPRIMARY, JACSECNDRY, JADEQUACY, JAIRRATE, JBATHEXCLU,...
## dbl (545): TOTROOMS, PERPOVLVL, OUTAGEFRQ, RENT, DINING, LAUNDY, RATINGHS, R...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ahs.household.data <- ahs.household.data %>% mutate(tot.cost.burden = ((TOTHCAMT * 12) + MAINTAMT) / HINCP)

clean.research.data <- ahs.household.data %>% filter(is.finite(tot.cost.burden), tot.cost.burden > 0)

clean.research.data <- clean.research.data %>% 
 mutate(log.cost = log(tot.cost.burden))

clean.research.data <- clean.research.data %>% mutate(housing.age = 2023 - YRBUILT)

clean.research.data <- clean.research.data %>%
  mutate(age.group = case_when(
    housing.age < 15 ~ "Under 15 years",
    housing.age >= 15 & housing.age <= 30 ~ "15-30 years",
    housing.age >= 31 & housing.age <= 50 ~ "31-50 years",
    housing.age >= 51 ~ "51+ years"))

clean.research.data$age.group <- factor(
  clean.research.data$age.group,
  levels = c("Under 15 years", "15-30 years", "31-50 years", "51+ years"))

clean.research.data <- clean.research.data %>%
 mutate(
 income.group = case_when(
   PERPOVLVL < 200 ~ "Lower income",
   PERPOVLVL >= 200 & PERPOVLVL < 400 ~ "Moderate income",
   PERPOVLVL >= 400 ~ "Higher income" ))

clean.research.data$income.group <- factor(
 clean.research.data$income.group,
 levels = c("Lower income",
            "Moderate income",
            "Higher income"))
#3 Descriptive statistics

# Continuous variables used in core models
vars <- clean.research.data %>%
 dplyr::select(
   tot.cost.burden,
   housing.age,
   MARKETVAL,
   PERPOVLVL,
   RATINGHS)

stat.desc(vars)
##              tot.cost.burden  housing.age     MARKETVAL     PERPOVLVL
## nbr.val         5.463500e+04 5.463500e+04  5.463500e+04  5.463500e+04
## nbr.null        0.000000e+00 9.500000e+01  0.000000e+00  0.000000e+00
## nbr.na          0.000000e+00 0.000000e+00  0.000000e+00  0.000000e+00
## min             6.972395e-04 0.000000e+00 -6.000000e+00 -6.000000e+00
## max             2.900400e+04 1.040000e+02  9.999998e+06  5.010000e+02
## range           2.900400e+04 1.040000e+02  1.000000e+07  5.070000e+02
## sum             2.420688e+05 2.782815e+06  1.687875e+10  1.462190e+07
## median          2.955556e-01 5.300000e+01  1.490830e+05  2.620000e+02
## mean            4.430655e+00 5.093466e+01  3.089367e+05  2.676288e+02
## SE.mean         5.959522e-01 1.172079e-01  2.515710e+03  8.240829e-01
## CI.mean.0.95    1.168071e+00 2.297283e-01  4.930810e+03  1.615209e+00
## var             1.940412e+04 7.505582e+02  3.457738e+11  3.710332e+04
## std.dev         1.392987e+02 2.739632e+01  5.880253e+05  1.926222e+02
## coef.var        3.143974e+01 5.378718e-01  1.903385e+00  7.197364e-01
##                   RATINGHS
## nbr.val       5.463500e+04
## nbr.null      0.000000e+00
## nbr.na        0.000000e+00
## min          -9.000000e+00
## max           1.000000e+01
## range         1.900000e+01
## sum           3.321590e+05
## median        8.000000e+00
## mean          6.079601e+00
## SE.mean       2.368003e-02
## CI.mean.0.95  4.641304e-02
## var           3.063625e+01
## std.dev       5.535002e+00
## coef.var      9.104219e-01
# Group frequencies for ANOVA categories
table(clean.research.data$age.group)
## 
## Under 15 years    15-30 years    31-50 years      51+ years 
##           5292           7279          14001          28063
table(clean.research.data$income.group)
## 
##    Lower income Moderate income   Higher income 
##           22956           12423           19256
#4 Baseline linear model using logged cost burden
affordable.housing.model <- lm(log.cost ~ housing.age + MARKETVAL + PERPOVLVL + RATINGHS, data = clean.research.data)

summary(affordable.housing.model)
## 
## Call:
## lm(formula = log.cost ~ housing.age + MARKETVAL + PERPOVLVL + 
##     RATINGHS, data = clean.research.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6973 -0.5380  0.0999  0.5774  9.4104 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.379e+00  1.064e-02  129.61   <2e-16 ***
## housing.age -2.628e-03  1.451e-04  -18.12   <2e-16 ***
## MARKETVAL    2.664e-07  6.907e-09   38.57   <2e-16 ***
## PERPOVLVL   -5.236e-03  2.447e-05 -213.97   <2e-16 ***
## RATINGHS    -1.113e-01  8.306e-04 -133.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9246 on 54630 degrees of freedom
## Multiple R-squared:  0.6962, Adjusted R-squared:  0.6962 
## F-statistic: 3.13e+04 on 4 and 54630 DF,  p-value: < 2.2e-16
#5 Regression assumption checks
#   a) Linearity (plot and raintest)
raintest(affordable.housing.model)
## 
##  Rainbow test
## 
## data:  affordable.housing.model
## Rain = 1.1041, df1 = 27318, df2 = 27312, p-value < 2.2e-16
plot(affordable.housing.model, which = 1)

#   b) Independence of errors (durbin-watson)
durbinWatsonTest(affordable.housing.model)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.03550215      1.928831       0
##  Alternative hypothesis: rho != 0
#   c) Homoscedasticity (plot, bptest) 
plot(affordable.housing.model,which=3)

bptest(affordable.housing.model)
## 
##  studentized Breusch-Pagan test
## 
## data:  affordable.housing.model
## BP = 632.91, df = 4, p-value < 2.2e-16
# d) Normality of residuals (QQ plot, KS test)

plot(affordable.housing.model,which=2)

ks.test(affordable.housing.model$residuals, "pnorm", mean = mean(affordable.housing.model$residuals), sd = sd(affordable.housing.model$residuals))
## Warning in ks.test.default(affordable.housing.model$residuals, "pnorm", : ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  affordable.housing.model$residuals
## D = 0.075341, p-value < 2.2e-16
## alternative hypothesis: two-sided
#  e) No multicolinarity (VIF, cor)

vif(affordable.housing.model)
## housing.age   MARKETVAL   PERPOVLVL    RATINGHS 
##    1.009275    1.054287    1.420175    1.350810
kitchen_sink_vars <- clean.research.data %>% dplyr::select(housing.age, MARKETVAL, PERPOVLVL, RATINGHS)

cor(kitchen_sink_vars)
##             housing.age   MARKETVAL   PERPOVLVL    RATINGHS
## housing.age  1.00000000 -0.03885332 -0.09352712 -0.05383571
## MARKETVAL   -0.03885332  1.00000000  0.21626259  0.05257105
## PERPOVLVL   -0.09352712  0.21626259  1.00000000  0.50620974
## RATINGHS    -0.05383571  0.05257105  0.50620974  1.00000000
#6 Two-way ANOVA: age group and income group

anova.model <- aov(
 tot.cost.burden ~ age.group * income.group,
 data = clean.research.data
)

summary(anova.model)
##                           Df    Sum Sq Mean Sq F value   Pr(>F)    
## age.group                  3 7.636e+04   25454   1.313    0.268    
## income.group               2 1.273e+06  636502  32.839 5.58e-15 ***
## age.group:income.group     6 4.176e+04    6961   0.359    0.905    
## Residuals              54623 1.059e+09   19383                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#7 Median burden by age and income group
interaction.plot(
 clean.research.data$age.group,
 clean.research.data$income.group,
 clean.research.data$tot.cost.burden,
 fun = median,
 ylab = "Median Housing Cost Burden",
 xlab = "Housing Age Group",
 trace.label = "Income Group")

#8 Supplemental repair mechanism models

# Does housing age predict resident housing quality ratings?
condition.model <- lm(
 RATINGHS ~ housing.age,
 data = clean.research.data
)

summary(condition.model)
## 
## Call:
## lm(formula = RATINGHS ~ housing.age, data = clean.research.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6336   0.4317   1.9429   3.5078   4.4976 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.6336006  0.0499181   132.9   <2e-16 ***
## housing.age -0.0108767  0.0008631   -12.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.527 on 54633 degrees of freedom
## Multiple R-squared:  0.002898,   Adjusted R-squared:  0.00288 
## F-statistic: 158.8 on 1 and 54633 DF,  p-value: < 2.2e-16
# Does housing age predict maintenance spending?
maintenance.model <- lm(
 MAINTAMT ~ housing.age + PERPOVLVL + MARKETVAL,
 data = clean.research.data
)

summary(maintenance.model)
## 
## Call:
## lm(formula = MAINTAMT ~ housing.age + PERPOVLVL + MARKETVAL, 
##     data = clean.research.data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8201   -660   -260      7  98703 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.582e+02  2.625e+01  -6.028 1.67e-09 ***
## housing.age  2.433e+00  3.639e-01   6.685 2.34e-11 ***
## PERPOVLVL    1.780e+00  5.298e-02  33.598  < 2e-16 ***
## MARKETVAL    7.317e-04  1.729e-05  42.317  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2320 on 54631 degrees of freedom
## Multiple R-squared:  0.06342,    Adjusted R-squared:  0.06337 
## F-statistic:  1233 on 3 and 54631 DF,  p-value: < 2.2e-16
#9 Gamma GLM robustness check
glm.data <- clean.research.data %>%
 filter(tot.cost.burden < quantile(tot.cost.burden,.99))

burden.glm <- glm(
tot.cost.burden ~ housing.age + MARKETVAL + PERPOVLVL + RATINGHS,
data = glm.data,
family = Gamma(link=log)
)

summary(burden.glm)
## 
## Call:
## glm(formula = tot.cost.burden ~ housing.age + MARKETVAL + PERPOVLVL + 
##     RATINGHS, family = Gamma(link = log), data = glm.data)
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  5.118e-01  2.133e-02   23.997  < 2e-16 ***
## housing.age -1.657e-03  2.048e-04   -8.089 6.14e-16 ***
## MARKETVAL    2.472e-07  1.048e-08   23.582  < 2e-16 ***
## PERPOVLVL   -4.757e-03  3.418e-05 -139.170  < 2e-16 ***
## RATINGHS     1.191e-05  1.773e-03    0.007    0.995    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1.44208)
## 
##     Null deviance: 66142  on 47005  degrees of freedom
## Residual deviance: 30337  on 47001  degrees of freedom
## AIC: -4536.3
## 
## Number of Fisher Scoring iterations: 8
#10 Final visualization: logged burden by income group 
ggplot(clean.research.data,
aes(x=income.group,y=log.cost)) +
geom_boxplot() +
labs(
title="Housing Cost Burden by Income Group",
x="Income Group",
y="Logged Housing Cost Burden")

#test to include in final maybe
#cuz it shows something i saw on the anova more clearly which is the slope for housing age vs burden by income groups
age.by.income <- clean.research.data %>%
 group_by(housing.age, income.group) %>%
 summarize(median.burden = median(tot.cost.burden, na.rm = TRUE),.groups = "drop")
ggplot(age.by.income,
 aes(housing.age, median.burden, color=income.group)) +
 geom_line() +
 coord_cartesian(xlim=c(0,50), ylim=c(0,2))