#1 Load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
#2 Load data and construct analysis variables
ahs.household.data <- read_csv("household.csv")
## Rows: 55669 Columns: 1180
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (635): CONTROL, JACPRIMARY, JACSECNDRY, JADEQUACY, JAIRRATE, JBATHEXCLU,...
## dbl (545): TOTROOMS, PERPOVLVL, OUTAGEFRQ, RENT, DINING, LAUNDY, RATINGHS, R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ahs.household.data <- ahs.household.data %>% mutate(tot.cost.burden = ((TOTHCAMT * 12) + MAINTAMT) / HINCP)
clean.research.data <- ahs.household.data %>% filter(is.finite(tot.cost.burden), tot.cost.burden > 0)
clean.research.data <- clean.research.data %>%
mutate(log.cost = log(tot.cost.burden))
clean.research.data <- clean.research.data %>% mutate(housing.age = 2023 - YRBUILT)
clean.research.data <- clean.research.data %>%
mutate(age.group = case_when(
housing.age < 15 ~ "Under 15 years",
housing.age >= 15 & housing.age <= 30 ~ "15-30 years",
housing.age >= 31 & housing.age <= 50 ~ "31-50 years",
housing.age >= 51 ~ "51+ years"))
clean.research.data$age.group <- factor(
clean.research.data$age.group,
levels = c("Under 15 years", "15-30 years", "31-50 years", "51+ years"))
clean.research.data <- clean.research.data %>%
mutate(
income.group = case_when(
PERPOVLVL < 200 ~ "Lower income",
PERPOVLVL >= 200 & PERPOVLVL < 400 ~ "Moderate income",
PERPOVLVL >= 400 ~ "Higher income" ))
clean.research.data$income.group <- factor(
clean.research.data$income.group,
levels = c("Lower income",
"Moderate income",
"Higher income"))
#3 Descriptive statistics
# Continuous variables used in core models
vars <- clean.research.data %>%
dplyr::select(
tot.cost.burden,
housing.age,
MARKETVAL,
PERPOVLVL,
RATINGHS)
stat.desc(vars)
## tot.cost.burden housing.age MARKETVAL PERPOVLVL
## nbr.val 5.463500e+04 5.463500e+04 5.463500e+04 5.463500e+04
## nbr.null 0.000000e+00 9.500000e+01 0.000000e+00 0.000000e+00
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min 6.972395e-04 0.000000e+00 -6.000000e+00 -6.000000e+00
## max 2.900400e+04 1.040000e+02 9.999998e+06 5.010000e+02
## range 2.900400e+04 1.040000e+02 1.000000e+07 5.070000e+02
## sum 2.420688e+05 2.782815e+06 1.687875e+10 1.462190e+07
## median 2.955556e-01 5.300000e+01 1.490830e+05 2.620000e+02
## mean 4.430655e+00 5.093466e+01 3.089367e+05 2.676288e+02
## SE.mean 5.959522e-01 1.172079e-01 2.515710e+03 8.240829e-01
## CI.mean.0.95 1.168071e+00 2.297283e-01 4.930810e+03 1.615209e+00
## var 1.940412e+04 7.505582e+02 3.457738e+11 3.710332e+04
## std.dev 1.392987e+02 2.739632e+01 5.880253e+05 1.926222e+02
## coef.var 3.143974e+01 5.378718e-01 1.903385e+00 7.197364e-01
## RATINGHS
## nbr.val 5.463500e+04
## nbr.null 0.000000e+00
## nbr.na 0.000000e+00
## min -9.000000e+00
## max 1.000000e+01
## range 1.900000e+01
## sum 3.321590e+05
## median 8.000000e+00
## mean 6.079601e+00
## SE.mean 2.368003e-02
## CI.mean.0.95 4.641304e-02
## var 3.063625e+01
## std.dev 5.535002e+00
## coef.var 9.104219e-01
# Group frequencies for ANOVA categories
table(clean.research.data$age.group)
##
## Under 15 years 15-30 years 31-50 years 51+ years
## 5292 7279 14001 28063
table(clean.research.data$income.group)
##
## Lower income Moderate income Higher income
## 22956 12423 19256
#4 Baseline linear model using logged cost burden
affordable.housing.model <- lm(log.cost ~ housing.age + MARKETVAL + PERPOVLVL + RATINGHS, data = clean.research.data)
summary(affordable.housing.model)
##
## Call:
## lm(formula = log.cost ~ housing.age + MARKETVAL + PERPOVLVL +
## RATINGHS, data = clean.research.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6973 -0.5380 0.0999 0.5774 9.4104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.379e+00 1.064e-02 129.61 <2e-16 ***
## housing.age -2.628e-03 1.451e-04 -18.12 <2e-16 ***
## MARKETVAL 2.664e-07 6.907e-09 38.57 <2e-16 ***
## PERPOVLVL -5.236e-03 2.447e-05 -213.97 <2e-16 ***
## RATINGHS -1.113e-01 8.306e-04 -133.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9246 on 54630 degrees of freedom
## Multiple R-squared: 0.6962, Adjusted R-squared: 0.6962
## F-statistic: 3.13e+04 on 4 and 54630 DF, p-value: < 2.2e-16
#5 Regression assumption checks
# a) Linearity (plot and raintest)
raintest(affordable.housing.model)
##
## Rainbow test
##
## data: affordable.housing.model
## Rain = 1.1041, df1 = 27318, df2 = 27312, p-value < 2.2e-16
plot(affordable.housing.model, which = 1)

# b) Independence of errors (durbin-watson)
durbinWatsonTest(affordable.housing.model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.03550215 1.928831 0
## Alternative hypothesis: rho != 0
# c) Homoscedasticity (plot, bptest)
plot(affordable.housing.model,which=3)

bptest(affordable.housing.model)
##
## studentized Breusch-Pagan test
##
## data: affordable.housing.model
## BP = 632.91, df = 4, p-value < 2.2e-16
# d) Normality of residuals (QQ plot, KS test)
plot(affordable.housing.model,which=2)

ks.test(affordable.housing.model$residuals, "pnorm", mean = mean(affordable.housing.model$residuals), sd = sd(affordable.housing.model$residuals))
## Warning in ks.test.default(affordable.housing.model$residuals, "pnorm", : ties
## should not be present for the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: affordable.housing.model$residuals
## D = 0.075341, p-value < 2.2e-16
## alternative hypothesis: two-sided
# e) No multicolinarity (VIF, cor)
vif(affordable.housing.model)
## housing.age MARKETVAL PERPOVLVL RATINGHS
## 1.009275 1.054287 1.420175 1.350810
kitchen_sink_vars <- clean.research.data %>% dplyr::select(housing.age, MARKETVAL, PERPOVLVL, RATINGHS)
cor(kitchen_sink_vars)
## housing.age MARKETVAL PERPOVLVL RATINGHS
## housing.age 1.00000000 -0.03885332 -0.09352712 -0.05383571
## MARKETVAL -0.03885332 1.00000000 0.21626259 0.05257105
## PERPOVLVL -0.09352712 0.21626259 1.00000000 0.50620974
## RATINGHS -0.05383571 0.05257105 0.50620974 1.00000000
#6 Two-way ANOVA: age group and income group
anova.model <- aov(
tot.cost.burden ~ age.group * income.group,
data = clean.research.data
)
summary(anova.model)
## Df Sum Sq Mean Sq F value Pr(>F)
## age.group 3 7.636e+04 25454 1.313 0.268
## income.group 2 1.273e+06 636502 32.839 5.58e-15 ***
## age.group:income.group 6 4.176e+04 6961 0.359 0.905
## Residuals 54623 1.059e+09 19383
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#7 Median burden by age and income group
interaction.plot(
clean.research.data$age.group,
clean.research.data$income.group,
clean.research.data$tot.cost.burden,
fun = median,
ylab = "Median Housing Cost Burden",
xlab = "Housing Age Group",
trace.label = "Income Group")

#8 Supplemental repair mechanism models
# Does housing age predict resident housing quality ratings?
condition.model <- lm(
RATINGHS ~ housing.age,
data = clean.research.data
)
summary(condition.model)
##
## Call:
## lm(formula = RATINGHS ~ housing.age, data = clean.research.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6336 0.4317 1.9429 3.5078 4.4976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6336006 0.0499181 132.9 <2e-16 ***
## housing.age -0.0108767 0.0008631 -12.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.527 on 54633 degrees of freedom
## Multiple R-squared: 0.002898, Adjusted R-squared: 0.00288
## F-statistic: 158.8 on 1 and 54633 DF, p-value: < 2.2e-16
# Does housing age predict maintenance spending?
maintenance.model <- lm(
MAINTAMT ~ housing.age + PERPOVLVL + MARKETVAL,
data = clean.research.data
)
summary(maintenance.model)
##
## Call:
## lm(formula = MAINTAMT ~ housing.age + PERPOVLVL + MARKETVAL,
## data = clean.research.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8201 -660 -260 7 98703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.582e+02 2.625e+01 -6.028 1.67e-09 ***
## housing.age 2.433e+00 3.639e-01 6.685 2.34e-11 ***
## PERPOVLVL 1.780e+00 5.298e-02 33.598 < 2e-16 ***
## MARKETVAL 7.317e-04 1.729e-05 42.317 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2320 on 54631 degrees of freedom
## Multiple R-squared: 0.06342, Adjusted R-squared: 0.06337
## F-statistic: 1233 on 3 and 54631 DF, p-value: < 2.2e-16
#9 Gamma GLM robustness check
glm.data <- clean.research.data %>%
filter(tot.cost.burden < quantile(tot.cost.burden,.99))
burden.glm <- glm(
tot.cost.burden ~ housing.age + MARKETVAL + PERPOVLVL + RATINGHS,
data = glm.data,
family = Gamma(link=log)
)
summary(burden.glm)
##
## Call:
## glm(formula = tot.cost.burden ~ housing.age + MARKETVAL + PERPOVLVL +
## RATINGHS, family = Gamma(link = log), data = glm.data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.118e-01 2.133e-02 23.997 < 2e-16 ***
## housing.age -1.657e-03 2.048e-04 -8.089 6.14e-16 ***
## MARKETVAL 2.472e-07 1.048e-08 23.582 < 2e-16 ***
## PERPOVLVL -4.757e-03 3.418e-05 -139.170 < 2e-16 ***
## RATINGHS 1.191e-05 1.773e-03 0.007 0.995
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 1.44208)
##
## Null deviance: 66142 on 47005 degrees of freedom
## Residual deviance: 30337 on 47001 degrees of freedom
## AIC: -4536.3
##
## Number of Fisher Scoring iterations: 8
#10 Final visualization: logged burden by income group
ggplot(clean.research.data,
aes(x=income.group,y=log.cost)) +
geom_boxplot() +
labs(
title="Housing Cost Burden by Income Group",
x="Income Group",
y="Logged Housing Cost Burden")

#test to include in final maybe
#cuz it shows something i saw on the anova more clearly which is the slope for housing age vs burden by income groups
age.by.income <- clean.research.data %>%
group_by(housing.age, income.group) %>%
summarize(median.burden = median(tot.cost.burden, na.rm = TRUE),.groups = "drop")
ggplot(age.by.income,
aes(housing.age, median.burden, color=income.group)) +
geom_line() +
coord_cartesian(xlim=c(0,50), ylim=c(0,2))
