Import and rename dataset
X2024_Dataset_Approved_Applications <- read_excel("D:/Escuela/UTSA Grad (2023-2025)/2025-1 Capstone/Energy burden/2024 Dataset- Approved Applications.xlsx")
Approved24 <- X2024_Dataset_Approved_Applications
rm(X2024_Dataset_Approved_Applications)
Add Random Numbers
Approved24$RandomNums <- sample(1:5773, nrow(Approved24), replace = FALSE)
Create a subset with random numbers up to 288
AppSub <- Approved24 %>%
filter(Approved24$RandomNums < 289)
Load random sample with CPS Energy data
App24Sample <- read_excel("D:/Escuela/UTSA Grad (2023-2025)/2025-1 Capstone/Energy burden/2024 Dataset Sample.xlsx")
Create a subset with households with income
App24SampInc <- App24Sample %>%
filter(App24Sample$`Annual Income` > 0)
Creating energy burden variables
App24SampInc$EnBurdPre <- App24SampInc$`Cost of Consumption` / App24SampInc$`Annual Income`
App24SampInc$EnBurdPost <- (App24SampInc$`Cost of Consumption` - App24SampInc$`Assistance Received`) / App24SampInc$`Annual Income`
New tables created with households of more than zero income
OwnerStatSub <- App24SampInc %>%
group_by(`Ownership Status`) %>%
summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)
HomeTypeSub <- App24SampInc %>%
group_by(`What type of home do you live in`) %>%
summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)
HomeSizeSub <- App24SampInc %>%
group_by(`Total people in household`) %>%
summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)
AnnIncSub <- App24SampInc %>%
group_by(`Annual Income`) %>%
summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)
EnBurdPreSub <- App24SampInc %>%
group_by(`EnBurdPre`) %>%
summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)
EnBurdPostSub <- App24SampInc %>%
group_by(`EnBurdPost`) %>%
summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)
Create a subset with households with income
App24Samp2 <- App24SampInc %>%
filter(App24SampInc$EnBurdPost > 0)
Create subset with only relevant variables
BurdenSubset <- dplyr::select (App24Samp2, `Ownership Status`, `Total people in household`, `What type of home do you live in`, `Annual Income`, `EnBurdPre`, `EnBurdPost`)
Run linear model on pre-assistance data to explain dependent variable by three independent variables
EnBurdPre_model <- lm(EnBurdPre~as.factor(`Ownership Status`)+`Total people in household`+as.factor(`What type of home do you live in`), data = BurdenSubset)
summary(EnBurdPre_model)
##
## Call:
## lm(formula = EnBurdPre ~ as.factor(`Ownership Status`) + `Total people in household` +
## as.factor(`What type of home do you live in`), data = BurdenSubset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25101 -0.11091 -0.04593 0.03232 2.23744
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 0.21493 0.06298
## as.factor(`Ownership Status`)Renter -0.04833 0.04989
## `Total people in household` 0.02371 0.01142
## as.factor(`What type of home do you live in`)House -0.11431 0.05424
## as.factor(`What type of home do you live in`)Mobile Home -0.14477 0.08595
## t value Pr(>|t|)
## (Intercept) 3.413 0.000825 ***
## as.factor(`Ownership Status`)Renter -0.969 0.334295
## `Total people in household` 2.076 0.039540 *
## as.factor(`What type of home do you live in`)House -2.107 0.036734 *
## as.factor(`What type of home do you live in`)Mobile Home -1.684 0.094188 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2483 on 152 degrees of freedom
## Multiple R-squared: 0.04824, Adjusted R-squared: 0.02319
## F-statistic: 1.926 on 4 and 152 DF, p-value: 0.1089
Testing for linearity:
plot(EnBurdPre_model,which=1)
Test for skewedness before applying log transformation:
hist(BurdenSubset$EnBurdPre, main="Histogram of EnBurdPre", xlab="Energy Burden", col="blue")
Log transformed model:
EnBurdPre_model_log <- lm(log(EnBurdPre) ~ as.factor(`Ownership Status`) +
`Total people in household` +
as.factor(`What type of home do you live in`),
data = BurdenSubset)
summary(EnBurdPre_model_log)
##
## Call:
## lm(formula = log(EnBurdPre) ~ as.factor(`Ownership Status`) +
## `Total people in household` + as.factor(`What type of home do you live in`),
## data = BurdenSubset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.66525 -0.53169 -0.01743 0.44510 3.07201
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2.13970 0.19027
## as.factor(`Ownership Status`)Renter -0.18270 0.15073
## `Total people in household` 0.05223 0.03450
## as.factor(`What type of home do you live in`)House -0.08019 0.16387
## as.factor(`What type of home do you live in`)Mobile Home -0.13613 0.25967
## t value Pr(>|t|)
## (Intercept) -11.246 <2e-16 ***
## as.factor(`Ownership Status`)Renter -1.212 0.227
## `Total people in household` 1.514 0.132
## as.factor(`What type of home do you live in`)House -0.489 0.625
## as.factor(`What type of home do you live in`)Mobile Home -0.524 0.601
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7501 on 152 degrees of freedom
## Multiple R-squared: 0.02215, Adjusted R-squared: -0.003586
## F-statistic: 0.8606 on 4 and 152 DF, p-value: 0.4892
Testing for linearity after log transformation:
plot(EnBurdPre_model_log,which=1)
Run EnBurdPre histogram
hist(BurdenSubset$EnBurdPre)
Rainbow test:
library(lmtest)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
raintest(EnBurdPre_model_log)
##
## Rainbow test
##
## data: EnBurdPre_model_log
## Rain = 1.1167, df1 = 79, df2 = 73, p-value = 0.3171
Durbin-Watson test to test for independence of errors
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
durbinWatsonTest(EnBurdPre_model_log)
## lag Autocorrelation D-W Statistic p-value
## 1 0.0558342 1.887888 0.486
## Alternative hypothesis: rho != 0
Test for homoscedasticity
library(lmtest)
bptest(EnBurdPre_model_log)
##
## studentized Breusch-Pagan test
##
## data: EnBurdPre_model_log
## BP = 11.285, df = 4, p-value = 0.02355
Test for normality of residuals
shapiro.test(EnBurdPre_model_log$residuals)
##
## Shapiro-Wilk normality test
##
## data: EnBurdPre_model_log$residuals
## W = 0.96031, p-value = 0.0001812
Testing for multicollinearity
vif(EnBurdPre_model_log)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(`Ownership Status`) 1.556512 1 1.247603
## `Total people in household` 1.072733 1 1.035728
## as.factor(`What type of home do you live in`) 1.622992 2 1.128701
Run Generalized Linear Model on pre-assistance data to explain dependent variable by three independent variables
GLMEnBurdPre_model <- glm(EnBurdPre ~ as.factor(`Ownership Status`) +
`Total people in household` +
as.factor(`What type of home do you live in`),
family = Gamma(link = "log"), data = BurdenSubset)
summary(GLMEnBurdPre_model)
##
## Call:
## glm(formula = EnBurdPre ~ as.factor(`Ownership Status`) + `Total people in household` +
## as.factor(`What type of home do you live in`), family = Gamma(link = "log"),
## data = BurdenSubset)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -1.5971 0.3022
## as.factor(`Ownership Status`)Renter -0.2938 0.2394
## `Total people in household` 0.1170 0.0548
## as.factor(`What type of home do you live in`)House -0.5276 0.2603
## as.factor(`What type of home do you live in`)Mobile Home -0.6075 0.4125
## t value Pr(>|t|)
## (Intercept) -5.284 4.31e-07 ***
## as.factor(`Ownership Status`)Renter -1.227 0.2216
## `Total people in household` 2.135 0.0344 *
## as.factor(`What type of home do you live in`)House -2.027 0.0444 *
## as.factor(`What type of home do you live in`)Mobile Home -1.473 0.1429
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 1.419838)
##
## Null deviance: 116.99 on 156 degrees of freedom
## Residual deviance: 103.81 on 152 degrees of freedom
## AIC: -272.28
##
## Number of Fisher Scoring iterations: 10
Comparing AIC between models
AIClm <- AIC(EnBurdPre_model)
print(AIClm)
## [1] 15.02721
AIClog <- AIC(EnBurdPre_model_log)
print(AIClog)
## [1] 362.1798
AICGLM <- AIC(GLMEnBurdPre_model)
print(AICGLM)
## [1] -272.2797
Run Generalized Linear Model on pre-assistance data to explain dependent variable by three independent variables
GLMEnBurdPost_model <- glm(EnBurdPost ~ as.factor(`Ownership Status`) +
`Total people in household` +
as.factor(`What type of home do you live in`),
family = Gamma(link = "log"), data = BurdenSubset)
summary(GLMEnBurdPost_model)
##
## Call:
## glm(formula = EnBurdPost ~ as.factor(`Ownership Status`) + `Total people in household` +
## as.factor(`What type of home do you live in`), family = Gamma(link = "log"),
## data = BurdenSubset)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2.54434 0.39351
## as.factor(`Ownership Status`)Renter -0.30467 0.31175
## `Total people in household` 0.09247 0.07135
## as.factor(`What type of home do you live in`)House -0.87955 0.33892
## as.factor(`What type of home do you live in`)Mobile Home -1.00663 0.53705
## t value Pr(>|t|)
## (Intercept) -6.466 1.3e-09 ***
## as.factor(`Ownership Status`)Renter -0.977 0.3300
## `Total people in household` 1.296 0.1970
## as.factor(`What type of home do you live in`)House -2.595 0.0104 *
## as.factor(`What type of home do you live in`)Mobile Home -1.874 0.0628 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 2.406873)
##
## Null deviance: 271.92 on 156 degrees of freedom
## Residual deviance: 243.89 on 152 degrees of freedom
## AIC: -653.82
##
## Number of Fisher Scoring iterations: 12
Create table to compare energy burdens by ownership status
EnBurdCompOwn <- BurdenSubset %>%
group_by(`Ownership Status`) %>%
summarise(across(c(EnBurdPre,EnBurdPost), mean))
Boxplot to compare energy burdens by ownership status
ggplot(BurdenSubset, aes(x = `Ownership Status`)) +
geom_boxplot(aes(y = EnBurdPre, color = "Before assistance"), position = position_dodge(width = 0.8), width = 0.4) +
geom_boxplot(aes(y = EnBurdPost, color = "After assistance"), position = position_dodge(width = 0.8), width = 0.4) +
labs(title = "Comparison of Before and After Assistance by Ownership Status",
x = "Ownership Status",
y = "Energy Burden") +
scale_color_manual(values = c("Before assistance" = "red", "After assistance" = "green")) +
theme_minimal()
Boxplot to compare energy burdens by household size
ggplot(BurdenSubset, aes(x = `Total people in household`)) +
geom_point(aes(y = EnBurdPre, color = "Before assistance"), alpha = 0.6) +
geom_point(aes(y = EnBurdPost, color = "After assistance"), alpha = 0.6) +
labs(title = "Comparison of Before and After Assistance by Household Size",
x = "Ownership Status",
y = "Energy Burden") +
scale_color_manual(values = c("Before assistance" = "red", "After assistance" = "green")) +
theme_minimal()
Create table to compare energy burdens by type of home
EnBurdCompType <- BurdenSubset %>%
group_by(`What type of home do you live in`) %>%
summarise(across(c(EnBurdPre,EnBurdPost), mean))
Boxplot to compare energy burdens by type of home
ggplot(BurdenSubset, aes(x = `What type of home do you live in`)) +
geom_boxplot(aes(y = EnBurdPre, color = "Before assistance"), position = position_dodge(width = 0.8), width = 0.4) +
geom_boxplot(aes(y = EnBurdPost, color = "After assistance"), position = position_dodge(width = 0.8), width = 0.4) +
labs(title = "Comparison of Before and After Assistance by Type of Home",
x = "Type of Home",
y = "Energy Burden") +
scale_color_manual(values = c("Before assistance" = "red", "After assistance" = "green")) +
theme_minimal()