library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
wd<-getwd()
ECD<-read_xlsx("ECD.xlsx")
ECD_frame <- data.frame(Consumption = ECD$`2023 Total Usage`, HHM= ECD$`Total people in household`, Income = ECD$`Annual Household Income`, Ownership = as.factor(ECD$`Ownership Status`))
head(ECD_frame)
## Consumption HHM Income Ownership
## 1 8804 1 13848.0 Renter
## 2 5892 1 7356.0 Renter
## 3 5814 1 18840.0 Renter
## 4 7730 1 15234.0 Renter
## 5 8601 1 11223.6 Renter
## 6 16841 1 10968.0 Renter
summary(ECD_frame)
## Consumption HHM Income Ownership
## Min. : 5814 Min. : 1.000 Min. : 0 Owner :123
## 1st Qu.:14526 1st Qu.: 2.000 1st Qu.:11424 Renter:176
## Median :18761 Median : 4.000 Median :20928
## Mean :21008 Mean : 3.766 Mean :23336
## 3rd Qu.:25755 3rd Qu.: 5.000 3rd Qu.:30460
## Max. :79495 Max. :12.000 Max. :91921
ECD_lm<-lm(Consumption ~ HHM + Income + Ownership, data = ECD_frame)
summary(ECD_lm)
##
## Call:
## lm(formula = Consumption ~ HHM + Income + Ownership, data = ECD_frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17282 -5915 -1849 3765 50734
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.811e+04 1.614e+03 11.224 < 2e-16 ***
## HHM 1.506e+03 2.623e+02 5.740 2.34e-08 ***
## Income 8.884e-03 3.616e-02 0.246 0.806
## OwnershipRenter -5.064e+03 1.173e+03 -4.319 2.14e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9769 on 295 degrees of freedom
## Multiple R-squared: 0.154, Adjusted R-squared: 0.1454
## F-statistic: 17.9 on 3 and 295 DF, p-value: 1.063e-10
ggplot(ECD_frame, aes(x=HHM)) + geom_histogram(bins=6)
ggplot(ECD_frame, aes(x=Income)) + geom_histogram(bins=6)
ggplot(ECD_frame, aes(x=Ownership, y=Consumption)) + geom_boxplot()
plot(ECD_lm, which = 1)
###PLOT: POINTS ARE SCATTERED SOMEWHAT AROUND THE STRAIGHT LINE. ALTHOUGH THERE ARE DEVIATIONS, THE LINE IS MOSTLY STRAIGHT, SUGGESTING THAT THE RELATIONSHIP BETWEEN VARIABLES IS MOSTLY LINEAR- ASSUMPTION MET.
raintest(ECD_lm)
##
## Rainbow test
##
## data: ECD_lm
## Rain = 0.54513, df1 = 150, df2 = 145, p-value = 0.9999
###RAINBOW TEST: HERE THE P VALUE IS HIGH, MEANING THE DATA IS LINEAR- ASSUMPTION MET.
###b)Independence of errors (durbin-watson)
durbinWatsonTest(ECD_lm)
## lag Autocorrelation D-W Statistic p-value
## 1 0.08565215 1.827332 0.096
## Alternative hypothesis: rho != 0
###DURBIN-WATSON TEST: HERE THE P-VALUE IS LESS THAN 0.5 (0.128),THIS MEANS THE ERRORS ARE NOT INDEPENDENT. HOWEVER THE D-W STATISTIC IS 1.827332, WHICH IS VERY CLOSE TO 2 MEANING ASSUMPTION IS MOST LIKELY MET.
plot(ECD_lm,which=3)
###FITTED PLOT: LINE IS PRETTY STRAIGHT AND THE POINTS CLUSTERED SOMEWHAT EVENLY AROUND IT, I WOULD SAY THIS MEETS THE ASSUMPTIONS.
bptest(ECD_lm)
##
## studentized Breusch-Pagan test
##
## data: ECD_lm
## BP = 3.7706, df = 3, p-value = 0.2873
###BP TEST: HERE THE P VALUE IS GREATER THAN 0.05 (P-VALUE= 0.2873). SINCE THE P VALUE IS NOT SIGNIFICANT, HETEROSCEDASCITY CAN BE REJECTED AND HOMOSCEDASTICITY CAN BE ASSUMED- ASSUMPTION MET
###d)Normality of residuals (QQ plot, shapiro test)
plot(ECD_lm,which=2)
###QQ PLOT: HERE THERE ARE OBSERVATIONS THAT DO NOT LIE ALONG THE DOTTED LINE, ASSUMPTION IS VIOLATED
shapiro.test(ECD_lm$residuals)
##
## Shapiro-Wilk normality test
##
## data: ECD_lm$residuals
## W = 0.86246, p-value = 1.185e-15
###SHAPIRO WILK TEST: TEST SHOWS P VALUE IS LESS THAN 0.05 MEANING THE DATA IS PROBABLY NOT NORMAL AND ASSUMPTION IS VIOLATED
###e)No multicolinarity
vif(ECD_lm)
## HHM Income Ownership
## 1.004355 1.047422 1.042983
###VIF: HERE THERE IS NO VIF THAT IS GREATER THAN 10 WHICH COULD MEAN THEY ARE NOT CORRELATED WITH OTHER VARIABLES- ASSUMPTION IS MET
###5) If your model violates an assumption, which one?
###MODEL VIOLATES: NORMALITY OF RESIDUALS
###6) What would you do to mitigate this assumption? Show your work.
###IN ORDER TO MITIGATE I WOULD ADJUST MODEL USING LOG
###HAVE TO FILTER DUE TO 0'S IN VARIABLE "ANNUAL HOUSEHOLD INCOME"
ECD_lm_filtered <- ECD_frame %>% filter(Consumption > 0, HHM > 0, Income > 0)
ECD_lm_log <- lm(log(Consumption) ~ log(HHM) + log(Income)+ Ownership, data = ECD_lm_filtered)
plot(ECD_lm,which=2)
plot(ECD_lm_log, which=2)
summary(ECD_lm_log)
##
## Call:
## lm(formula = log(Consumption) ~ log(HHM) + log(Income) + Ownership,
## data = ECD_lm_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.18160 -0.25928 -0.01066 0.24924 1.58854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.44958 0.37631 25.111 < 2e-16 ***
## log(HHM) 0.27200 0.03992 6.814 6.01e-11 ***
## log(Income) 0.02264 0.03753 0.603 0.547
## OwnershipRenter -0.23010 0.05333 -4.314 2.24e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4268 on 274 degrees of freedom
## Multiple R-squared: 0.2109, Adjusted R-squared: 0.2023
## F-statistic: 24.41 on 3 and 274 DF, p-value: 4.961e-14