Import and rename dataset

X2024_Dataset_Approved_Applications <- read_excel("D:/Escuela/UTSA Grad (2023-2025)/2025-1 Capstone/Energy burden/2024 Dataset- Approved Applications.xlsx")

Approved24 <- X2024_Dataset_Approved_Applications
rm(X2024_Dataset_Approved_Applications)

Add Random Numbers

Approved24$RandomNums <- sample(1:5773, nrow(Approved24), replace = FALSE)

Create a subset with random numbers up to 288

AppSub <- Approved24 %>%
     filter(Approved24$RandomNums < 289)

Load random sample with CPS Energy data

App24Sample <- read_excel("D:/Escuela/UTSA Grad (2023-2025)/2025-1 Capstone/Energy burden/2024 Dataset Sample.xlsx")

Create a subset with households with income

App24SampInc <- App24Sample %>%
     filter(App24Sample$`Annual  Income` > 0)

Creating energy burden variables

App24SampInc$EnBurdPre <- App24SampInc$`Cost of Consumption` / App24SampInc$`Annual  Income`

App24SampInc$EnBurdPost <- (App24SampInc$`Cost of Consumption` - App24SampInc$`Assistance Received`) / App24SampInc$`Annual  Income`

New tables created with households of more than zero income

OwnerStatSub <- App24SampInc %>%
     group_by(`Ownership Status`) %>%
     summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)

HomeTypeSub <- App24SampInc %>%
     group_by(`What type of home do you live in`) %>%
     summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)

HomeSizeSub <- App24SampInc %>%
     group_by(`Total people in household`) %>%
     summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)

AnnIncSub <- App24SampInc %>%
     group_by(`Annual  Income`) %>%
     summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)

EnBurdPreSub <- App24SampInc %>%
     group_by(`EnBurdPre`) %>%
     summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)

EnBurdPostSub <- App24SampInc %>%
     group_by(`EnBurdPost`) %>%
     summarise(Count = n(), Percentage = (n() / nrow(App24SampInc)) * 100)

Create a subset with households with income

App24Samp2 <- App24SampInc %>%
     filter(App24SampInc$EnBurdPost > 0)

Create subset with only relevant variables

BurdenSubset <- dplyr::select (App24Samp2, `Ownership Status`, `Total people in household`, `What type of home do you live in`, `Annual  Income`, `EnBurdPre`, `EnBurdPost`)

Run linear model on pre-assistance data to explain dependent variable by three independent variables

EnBurdPre_model <- lm(EnBurdPre~as.factor(`Ownership Status`)+`Total people in household`+as.factor(`What type of home do you live in`), data = BurdenSubset)
summary(EnBurdPre_model)
## 
## Call:
## lm(formula = EnBurdPre ~ as.factor(`Ownership Status`) + `Total people in household` + 
##     as.factor(`What type of home do you live in`), data = BurdenSubset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25101 -0.11091 -0.04593  0.03232  2.23744 
## 
## Coefficients:
##                                                          Estimate Std. Error
## (Intercept)                                               0.21493    0.06298
## as.factor(`Ownership Status`)Renter                      -0.04833    0.04989
## `Total people in household`                               0.02371    0.01142
## as.factor(`What type of home do you live in`)House       -0.11431    0.05424
## as.factor(`What type of home do you live in`)Mobile Home -0.14477    0.08595
##                                                          t value Pr(>|t|)    
## (Intercept)                                                3.413 0.000825 ***
## as.factor(`Ownership Status`)Renter                       -0.969 0.334295    
## `Total people in household`                                2.076 0.039540 *  
## as.factor(`What type of home do you live in`)House        -2.107 0.036734 *  
## as.factor(`What type of home do you live in`)Mobile Home  -1.684 0.094188 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2483 on 152 degrees of freedom
## Multiple R-squared:  0.04824,    Adjusted R-squared:  0.02319 
## F-statistic: 1.926 on 4 and 152 DF,  p-value: 0.1089

Testing for linearity:

plot(EnBurdPre_model,which=1)

Test for skewedness before applying log transformation:

hist(BurdenSubset$EnBurdPre, main="Histogram of EnBurdPre", xlab="Energy Burden", col="blue")

Log transformed model:

EnBurdPre_model_log <- lm(log(EnBurdPre) ~ as.factor(`Ownership Status`) + 
                          `Total people in household` + 
                          as.factor(`What type of home do you live in`), 
                          data = BurdenSubset)
summary(EnBurdPre_model_log)
## 
## Call:
## lm(formula = log(EnBurdPre) ~ as.factor(`Ownership Status`) + 
##     `Total people in household` + as.factor(`What type of home do you live in`), 
##     data = BurdenSubset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.66525 -0.53169 -0.01743  0.44510  3.07201 
## 
## Coefficients:
##                                                          Estimate Std. Error
## (Intercept)                                              -2.13970    0.19027
## as.factor(`Ownership Status`)Renter                      -0.18270    0.15073
## `Total people in household`                               0.05223    0.03450
## as.factor(`What type of home do you live in`)House       -0.08019    0.16387
## as.factor(`What type of home do you live in`)Mobile Home -0.13613    0.25967
##                                                          t value Pr(>|t|)    
## (Intercept)                                              -11.246   <2e-16 ***
## as.factor(`Ownership Status`)Renter                       -1.212    0.227    
## `Total people in household`                                1.514    0.132    
## as.factor(`What type of home do you live in`)House        -0.489    0.625    
## as.factor(`What type of home do you live in`)Mobile Home  -0.524    0.601    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7501 on 152 degrees of freedom
## Multiple R-squared:  0.02215,    Adjusted R-squared:  -0.003586 
## F-statistic: 0.8606 on 4 and 152 DF,  p-value: 0.4892

Testing for linearity after log transformation:

plot(EnBurdPre_model_log,which=1)

Run EnBurdPre histogram

hist(BurdenSubset$EnBurdPre)

Rainbow test:

library(lmtest)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
raintest(EnBurdPre_model_log)
## 
##  Rainbow test
## 
## data:  EnBurdPre_model_log
## Rain = 1.1167, df1 = 79, df2 = 73, p-value = 0.3171

Durbin-Watson test to test for independence of errors

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
durbinWatsonTest(EnBurdPre_model_log)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.0558342      1.887888   0.486
##  Alternative hypothesis: rho != 0

Test for homoscedasticity

library(lmtest)
bptest(EnBurdPre_model_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  EnBurdPre_model_log
## BP = 11.285, df = 4, p-value = 0.02355

Test for normality of residuals

shapiro.test(EnBurdPre_model_log$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  EnBurdPre_model_log$residuals
## W = 0.96031, p-value = 0.0001812

Testing for multicollinearity

vif(EnBurdPre_model_log)
##                                                   GVIF Df GVIF^(1/(2*Df))
## as.factor(`Ownership Status`)                 1.556512  1        1.247603
## `Total people in household`                   1.072733  1        1.035728
## as.factor(`What type of home do you live in`) 1.622992  2        1.128701

Run Generalized Linear Model on pre-assistance data to explain dependent variable by three independent variables

GLMEnBurdPre_model <- glm(EnBurdPre ~ as.factor(`Ownership Status`) + 
                  `Total people in household` + 
                  as.factor(`What type of home do you live in`), 
                  family = Gamma(link = "log"), data = BurdenSubset)
summary(GLMEnBurdPre_model)
## 
## Call:
## glm(formula = EnBurdPre ~ as.factor(`Ownership Status`) + `Total people in household` + 
##     as.factor(`What type of home do you live in`), family = Gamma(link = "log"), 
##     data = BurdenSubset)
## 
## Coefficients:
##                                                          Estimate Std. Error
## (Intercept)                                               -1.5971     0.3022
## as.factor(`Ownership Status`)Renter                       -0.2938     0.2394
## `Total people in household`                                0.1170     0.0548
## as.factor(`What type of home do you live in`)House        -0.5276     0.2603
## as.factor(`What type of home do you live in`)Mobile Home  -0.6075     0.4125
##                                                          t value Pr(>|t|)    
## (Intercept)                                               -5.284 4.31e-07 ***
## as.factor(`Ownership Status`)Renter                       -1.227   0.2216    
## `Total people in household`                                2.135   0.0344 *  
## as.factor(`What type of home do you live in`)House        -2.027   0.0444 *  
## as.factor(`What type of home do you live in`)Mobile Home  -1.473   0.1429    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1.419838)
## 
##     Null deviance: 116.99  on 156  degrees of freedom
## Residual deviance: 103.81  on 152  degrees of freedom
## AIC: -272.28
## 
## Number of Fisher Scoring iterations: 10

Comparing AIC between models

AIClm <- AIC(EnBurdPre_model)
print(AIClm)
## [1] 15.02721
AIClog <- AIC(EnBurdPre_model_log)
print(AIClog)
## [1] 362.1798
AICGLM <- AIC(GLMEnBurdPre_model)
print(AICGLM)
## [1] -272.2797

Run Generalized Linear Model on pre-assistance data to explain dependent variable by three independent variables

GLMEnBurdPost_model <- glm(EnBurdPost ~ as.factor(`Ownership Status`) + 
                  `Total people in household` + 
                  as.factor(`What type of home do you live in`), 
                  family = Gamma(link = "log"), data = BurdenSubset)
summary(GLMEnBurdPost_model)
## 
## Call:
## glm(formula = EnBurdPost ~ as.factor(`Ownership Status`) + `Total people in household` + 
##     as.factor(`What type of home do you live in`), family = Gamma(link = "log"), 
##     data = BurdenSubset)
## 
## Coefficients:
##                                                          Estimate Std. Error
## (Intercept)                                              -2.54434    0.39351
## as.factor(`Ownership Status`)Renter                      -0.30467    0.31175
## `Total people in household`                               0.09247    0.07135
## as.factor(`What type of home do you live in`)House       -0.87955    0.33892
## as.factor(`What type of home do you live in`)Mobile Home -1.00663    0.53705
##                                                          t value Pr(>|t|)    
## (Intercept)                                               -6.466  1.3e-09 ***
## as.factor(`Ownership Status`)Renter                       -0.977   0.3300    
## `Total people in household`                                1.296   0.1970    
## as.factor(`What type of home do you live in`)House        -2.595   0.0104 *  
## as.factor(`What type of home do you live in`)Mobile Home  -1.874   0.0628 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 2.406873)
## 
##     Null deviance: 271.92  on 156  degrees of freedom
## Residual deviance: 243.89  on 152  degrees of freedom
## AIC: -653.82
## 
## Number of Fisher Scoring iterations: 12

Create table to compare energy burdens by ownership status

EnBurdCompOwn <- BurdenSubset %>%
 group_by(`Ownership Status`) %>%
 summarise(across(c(EnBurdPre,EnBurdPost), mean))

Boxplot to compare energy burdens by ownership status

ggplot(BurdenSubset, aes(x = `Ownership Status`)) +
  geom_boxplot(aes(y = EnBurdPre, color = "Before assistance"), position = position_dodge(width = 0.8), width = 0.4) +
  geom_boxplot(aes(y = EnBurdPost, color = "After assistance"), position = position_dodge(width = 0.8), width = 0.4) +
  labs(title = "Comparison of Before and After Assistance by Ownership Status",
       x = "Ownership Status",
       y = "Energy Burden") +
  scale_color_manual(values = c("Before assistance" = "red", "After assistance" = "green")) +
  theme_minimal()

Boxplot to compare energy burdens by household size

ggplot(BurdenSubset, aes(x = `Total people in household`)) +
  geom_point(aes(y = EnBurdPre, color = "Before assistance"), alpha = 0.6) +
  geom_point(aes(y = EnBurdPost, color = "After assistance"), alpha = 0.6) +
  labs(title = "Comparison of Before and After Assistance by Household Size",
       x = "Ownership Status",
       y = "Energy Burden") +
  scale_color_manual(values = c("Before assistance" = "red", "After assistance" = "green")) +
  theme_minimal()

Create table to compare energy burdens by type of home

EnBurdCompType <- BurdenSubset %>%
 group_by(`What type of home do you live in`) %>%
 summarise(across(c(EnBurdPre,EnBurdPost), mean))

Boxplot to compare energy burdens by type of home

ggplot(BurdenSubset, aes(x = `What type of home do you live in`)) +
  geom_boxplot(aes(y = EnBurdPre, color = "Before assistance"), position = position_dodge(width = 0.8), width = 0.4) +
  geom_boxplot(aes(y = EnBurdPost, color = "After assistance"), position = position_dodge(width = 0.8), width = 0.4) +
  labs(title = "Comparison of Before and After Assistance by Type of Home",
       x = "Type of Home",
       y = "Energy Burden") +
  scale_color_manual(values = c("Before assistance" = "red", "After assistance" = "green")) +
  theme_minimal()