library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Load the raw table from Table9.csv (path is relative to the working
# directory); column types are guessed by readr.
HousingData <- read_csv(file = "Table9.csv")
## Rows: 3221 Columns: 152
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): source, sumlevel, geoid, name, st, cnty
## dbl (146): T9_est1, T9_est2, T9_est3, T9_est4, T9_est5, T9_est6, T9_est7, T9...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Derive three percentage columns from the raw estimates in HousingData.
# Each one is a count (or a sum of two counts) divided by T9_est1 and then
# multiplied by 100.
# NOTE(review): T9_est1 is presumably the universe total for the table, and
# rows with T9_est1 == 0 would yield NaN/Inf here -- confirm against the
# data dictionary.
CHAS_renters <- mutate(
  HousingData,
  severe_prob_rate = 100 * (T9_est5 / T9_est1),
  overcrowd_rate = 100 * (T9_est10 / T9_est1),
  incomplete_rate = 100 * ((T9_est12 + T9_est13) / T9_est1)
)
# Fit an ordinary least-squares regression of severe_prob_rate on the two
# other derived rates, then print the coefficient table and fit statistics.
model <- lm(
  formula = severe_prob_rate ~ overcrowd_rate + incomplete_rate,
  data = CHAS_renters
)
summary(model)
##
## Call:
## lm(formula = severe_prob_rate ~ overcrowd_rate + incomplete_rate,
## data = CHAS_renters)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0394 -1.2284 -0.0352 1.3681 10.8438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.03935 0.04795 125.947 <2e-16 ***
## overcrowd_rate -0.63794 0.03476 -18.352 <2e-16 ***
## incomplete_rate -0.05017 0.02477 -2.025 0.0429 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.302 on 3218 degrees of freedom
## Multiple R-squared: 0.09888, Adjusted R-squared: 0.09832
## F-statistic: 176.6 on 2 and 3218 DF, p-value: < 2.2e-16
# Linearity check: draw the residuals-vs-fitted diagnostic plot, then run the
# rainbow test (lmtest) on the fitted model.
plot(model, which = 1)
raintest(model)
##
## Rainbow test
##
## data: model
## Rain = 1.5521, df1 = 1611, df2 = 1607, p-value < 2.2e-16
# Independence check: Durbin-Watson test (lmtest) for autocorrelation in the
# residuals.
# NOTE(review): the statistic depends on the order of the rows; confirm that
# the row order of CHAS_renters is meaningful before reading the result as
# evidence of dependent errors.
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 1.3254, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# Constant-variance check: draw the scale-location diagnostic plot, then run
# the studentized Breusch-Pagan test (lmtest) on the fitted model.
plot(model, which = 3)
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 52.188, df = 2, p-value = 4.65e-12
# Normality check: draw the normal Q-Q plot of the residuals, then extract
# the residuals and run the Shapiro-Wilk test on them.
plot(model, which = 2)
# Keep the residuals in a named object; the test output labels its data
# with this name.
res <- resid(model)
shapiro.test(res)
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.9805, p-value < 2.2e-16
# Multicollinearity check: variance inflation factor (car) for each predictor.
vif(model)
## overcrowd_rate incomplete_rate
## 1.011949 1.011949
# The model does not meet all of the assumptions. It violates linearity (rainbow test) and independence of errors (Durbin-Watson test), and the Breusch-Pagan test also rejects constant error variance. There are some issues with normality of the residuals, but this is a large dataset, so that could influence the results.
# I would use a log-transformed model to improve the fit.