library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
#1 Load Data & Set Assemble Variables
setwd("C:/Users/cramo/OneDrive/Desktop/My Class Stuff/Monday Class")
ahs.household.data <- read_csv("household.csv")
## Rows: 55669 Columns: 1180
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (635): CONTROL, JACPRIMARY, JACSECNDRY, JADEQUACY, JAIRRATE, JBATHEXCLU,...
## dbl (545): TOTROOMS, PERPOVLVL, OUTAGEFRQ, RENT, DINING, LAUNDY, RATINGHS, R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ahs.household.data <- ahs.household.data %>% mutate(tot.cost.burden = ((TOTHCAMT * 12) + MAINTAMT) / HINCP)
clean.research.data <- ahs.household.data %>% filter(is.finite(tot.cost.burden), tot.cost.burden > 0)
clean.research.data <- clean.research.data %>% mutate(housing.age = 2023 - YRBUILT)
#2 Create the Linear Model
affordable.housing.model <- lm(tot.cost.burden ~ housing.age + MARKETVAL + PERPOVLVL + RATINGHS, data = clean.research.data)
#3 Test the conditions for linearity
# a) Linearity (plot and raintest)
raintest(affordable.housing.model)
##
## Rainbow test
##
## data: affordable.housing.model
## Rain = 0.18116, df1 = 27318, df2 = 27312, p-value = 1
plot(affordable.housing.model, which = 1)
# b) Independence of errors (durbin-watson)
durbinWatsonTest(affordable.housing.model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.0003733763 1.999253 0.154
## Alternative hypothesis: rho != 0
# c) Homoscedasticity (plot, bptest)
plot(affordable.housing.model,which=3)
bptest(affordable.housing.model)
##
## studentized Breusch-Pagan test
##
## data: affordable.housing.model
## BP = 19.235, df = 4, p-value = 0.0007066
# d) Normality of residuals (QQ plot, shapiro test)
plot(affordable.housing.model,which=2)
ks.test(affordable.housing.model$residuals, "pnorm", mean = mean(affordable.housing.model$residuals), sd = sd(affordable.housing.model$residuals))
## Warning in ks.test.default(affordable.housing.model$residuals, "pnorm", : ties
## should not be present for the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: affordable.housing.model$residuals
## D = 0.47497, p-value < 2.2e-16
## alternative hypothesis: two-sided
# e) No multicolinarity (VIF, cor)
vif(affordable.housing.model)
## housing.age MARKETVAL PERPOVLVL RATINGHS
## 1.009275 1.054287 1.420175 1.350810
kitchen_sink_vars <- clean.research.data %>% dplyr::select(housing.age, MARKETVAL, PERPOVLVL, RATINGHS)
cor(kitchen_sink_vars)
## housing.age MARKETVAL PERPOVLVL RATINGHS
## housing.age 1.00000000 -0.03885332 -0.09352712 -0.05383571
## MARKETVAL -0.03885332 1.00000000 0.21626259 0.05257105
## PERPOVLVL -0.09352712 0.21626259 1.00000000 0.50620974
## RATINGHS -0.05383571 0.05257105 0.50620974 1.00000000
###4&5 Which assumptions of linearity does your model meet (or not) #a The model meets the conditions for linearity. The residual vs fitted graph appears to be linear, as the fitted line is flat and the values are closely grouped with the exception of a few outliers that jump up significantly. The Rainbow test returns a p-value of 1, which is greater than .05, meaning the null hypothesis is not rejected and the linearity assumption is met. #b The model also meets the conditions for independence of errors. The Durbin-Watson test returns a p-value of 0.142, which is greater than .05, meaning the null hypothesis is not rejected and the errors are independent. Additionally, the D-W statistic is very close to 2 at 1.99. #c The model does not meet the conditions for homoscedasticity. The Breusch-Pagan test returns a p-value of 0.0007066, which is less than .05, meaning the null hypothesis is rejected and the variance of the errors is not constant. This is also visible in the plot, where the residuals are tightly clustered at lower fitted values, then spread out much more in the middle, and narrow again as the fitted values increase, showing that the spread is not consistent across the model. #d The model does not meet the normality of residuals assumption. The KS test returns a p-value less than 2.2e-16, which is less than .05, meaning the null hypothesis is rejected and the residuals are not normally distributed. This is also visible in the Q-Q plot, where most of the values are tightly grouped near the line at the lower end, but the upper end pulls away sharply, with several large outliers causing a heavy right tail. #e The model meets the conditions for multicollinearity. All VIF values are well below 10, ranging from about 1.00 to 1.42, indicating that there is no significant multicollinearity between the independent variables. The correlation matrix also supports this, as most relationships between variables are weak to moderate. Some correlations are negative, meaning the variables move in opposite directions, but none are strong enough to cause concern. #f The model meets the assumptions of linearity, independence of errors, and multicollinearity, but does not meet the assumptions of homoscedasticity and normality.
#To mitigate this issue I would run a log on the DV
logdv.affordable.housing.model <- lm(log(tot.cost.burden) ~ housing.age + MARKETVAL + PERPOVLVL + RATINGHS, data = clean.research.data)
#Mitigated Homoscedasticity
plot(logdv.affordable.housing.model,which=3)
bptest(logdv.affordable.housing.model)
##
## studentized Breusch-Pagan test
##
## data: logdv.affordable.housing.model
## BP = 632.91, df = 4, p-value < 2.2e-16
#Mitigated Normality of Residuals
plot(logdv.affordable.housing.model,which=2)
ks.test(logdv.affordable.housing.model$residuals, "pnorm", mean = mean(logdv.affordable.housing.model$residuals), sd = sd(logdv.affordable.housing.model$residuals))
## Warning in ks.test.default(logdv.affordable.housing.model$residuals, "pnorm", :
## ties should not be present for the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: logdv.affordable.housing.model$residuals
## D = 0.075341, p-value < 2.2e-16
## alternative hypothesis: two-sided
###To mitigate these issues, I applied a log transformation to the dependent variable. This improved the normality of the residuals, as the Q-Q plot shows a less extreme deviation from the line compared to the original model. However, the KS test remains significant with a p-value less than 2.2e-16, indicating that the residuals are still not normally distributed, likely due to the large sample size making the test very sensitive. The model also still does not meet the homoscedasticity assumption. The Breusch-Pagan test remains significant with a p-value less than 2.2e-16, indicating that the variance of the errors is still not constant.
#TESTTo mitigate this issue I would run a log on the linear model
clean.log.research.data <- clean.research.data %>%
filter(housing.age > 0, MARKETVAL > 0, PERPOVLVL > 0, RATINGHS > 0)
log.all.affordable.housing.model <- lm(log(tot.cost.burden) ~ log(housing.age) + log(MARKETVAL) + log(PERPOVLVL) + log(RATINGHS), data = clean.log.research.data)
#Mitigated Homoscedasticity
plot(log.all.affordable.housing.model,which=3)
bptest(log.all.affordable.housing.model)
##
## studentized Breusch-Pagan test
##
## data: log.all.affordable.housing.model
## BP = 135.72, df = 4, p-value < 2.2e-16
#Mitigated Normality of Residuals
plot(log.all.affordable.housing.model,which=2)
ks.test(log.all.affordable.housing.model$residuals, "pnorm", mean = mean(log.all.affordable.housing.model$residuals), sd = sd(log.all.affordable.housing.model$residuals))
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: log.all.affordable.housing.model$residuals
## D = 0.031752, p-value < 2.2e-16
## alternative hypothesis: two-sided