library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
#The dataset I chose is the data collected by the DOE as part of reporting by electric utilities yearly operations. They collect data on its organizational structure, which zip codes it serves and the electricity rates for commerical, residential, and industrial customers.
iou_zipcodes_2022<-read.csv("iou_zipcodes_2022.csv")
non_iou_zipcodes_2022<-read.csv("non_iou_zipcodes_2022.csv")
#Here I combined the datasets in order to have a unified dataset for all utilites in the country.
combinedzips<-rbind(iou_zipcodes_2022,non_iou_zipcodes_2022)
#I am going to clean up some of the excess information here. Each row is almsot identical except for the zip code, so I am going to combine the rows into single columns based around the utility information.
CZ<-combinedzips %>%
distinct(utility_name, .keep_all = TRUE)
#This is the function to clear the 0’s from the residential, commercial, and industrial rates.
CZres <- CZ %>% filter(res_rate != 0)
CZcom <- CZ %>% filter(comm_rate != 0)
CZind <- CZ %>% filter(ind_rate != 0)
#I am specifically interested in how the grid interconnection may affect the overall model so I am going to add in a new column that assigns a grid interconnection value to each state. I will do this for each cleaned dataset.
CZ_res_grids<-CZres %>% mutate(grid_connection=ifelse(state %in% c("AL", "AR", "MA", "OK", "MI", "VA", "WV", "IA", "TN", "NJ", "MD", "ME", "NC", "SC", "NY", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS", "RI", "NH", "NE", "LA","DC"),"Eastern",
ifelse(state %in% c("AZ","WY","NM","ID","OR","MT","NV","CA","UT","CO","WA"),"Western",
ifelse(state %in% c("AK"),"Alaska",
ifelse(state %in% c("HI"), "Hawaii",
ifelse(state %in% c("TX"),"Texas",NA))))))
CZ_com_grids<-CZcom %>% mutate(grid_connection=ifelse(state %in% c("AL", "AR", "MA", "OK", "MI", "VA", "WV", "IA", "TN", "NJ", "MD", "ME", "NC", "SC", "NY", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS", "RI", "NH", "NE", "LA","DC"),"Eastern",
ifelse(state %in% c("AZ","WY","NM","ID","OR","MT","NV","CA","UT","CO","WA"),"Western",
ifelse(state %in% c("AK"),"Alaska",
ifelse(state %in% c("HI"), "Hawaii",
ifelse(state %in% c("TX"),"Texas",NA))))))
CZ_ind_grids<-CZind %>% mutate(grid_connection=ifelse(state %in% c("AL", "AR", "MA", "OK", "MI", "VA", "WV", "IA", "TN", "NJ", "MD", "ME", "NC", "SC", "NY", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS", "RI", "NH", "NE", "LA","DC"),"Eastern",
ifelse(state %in% c("AZ","WY","NM","ID","OR","MT","NV","CA","UT","CO","WA"),"Western",
ifelse(state %in% c("AK"),"Alaska",
ifelse(state %in% c("HI"), "Hawaii",
ifelse(state %in% c("TX"),"Texas",NA))))))
#Let me run a normality test.
shapiro.test(CZ_res_grids$res_rate)
##
## Shapiro-Wilk normality test
##
## data: CZ_res_grids$res_rate
## W = 0.70434, p-value < 2.2e-16
shapiro.test(CZ_com_grids$comm_rate)
##
## Shapiro-Wilk normality test
##
## data: CZ_com_grids$comm_rate
## W = 0.74878, p-value < 2.2e-16
shapiro.test(CZ_ind_grids$ind_rate)
##
## Shapiro-Wilk normality test
##
## data: CZ_ind_grids$ind_rate
## W = 0.71233, p-value < 2.2e-16
#All the shapiro tests were very significant. I’ve done histograms in the past so I’m going to go ahead and do a log transformation to see if I can normalize them.
CZ_res_grids_sqrt<-CZ_res_grids %>% mutate(res_sqrt=sqrt(res_rate))
CZ_comm_grids_sqrt<-CZ_com_grids %>% mutate(comm_sqrt=sqrt(comm_rate))
CZ_ind_grids_sqrt<-CZ_ind_grids %>% mutate(ind_sqrt=sqrt(ind_rate))
#Test for normality again.
shapiro.test(CZ_res_grids_sqrt$res_rate)
##
## Shapiro-Wilk normality test
##
## data: CZ_res_grids_sqrt$res_rate
## W = 0.70434, p-value < 2.2e-16
shapiro.test(CZ_comm_grids_sqrt$comm_rate)
##
## Shapiro-Wilk normality test
##
## data: CZ_comm_grids_sqrt$comm_rate
## W = 0.74878, p-value < 2.2e-16
shapiro.test(CZ_ind_grids_sqrt$ind_rate)
##
## Shapiro-Wilk normality test
##
## data: CZ_ind_grids_sqrt$ind_rate
## W = 0.71233, p-value < 2.2e-16
#still significant. Here are some histograms of the data.
hist(CZ_res_grids_sqrt$res_sqrt)
hist(CZ_comm_grids_sqrt$comm_sqrt)
hist(CZ_ind_grids_sqrt$ind_sqrt)
#I also decided to relevel the factors to base the estimates around Texas in the linear models. Those functions are below.
CZ_res_grids_sqrt$grid_connection <- relevel(factor(CZ_res_grids_sqrt$grid_connection), ref = "Texas")
CZ_comm_grids_sqrt$grid_connection <- relevel(factor(CZ_comm_grids_sqrt$grid_connection), ref = "Texas")
CZ_ind_grids_sqrt$grid_connection <- relevel(factor(CZ_ind_grids_sqrt$grid_connection), ref = "Texas")
#Now I can attempt some linear models to see if anything is outragesously different. There are not a lot of numerical variables so there is going to be only one dependent and one independent for each linear model for now to test.
CZRLM<-lm(res_sqrt~grid_connection, data = CZ_res_grids_sqrt)
summary(CZRLM)
##
## Call:
## lm(formula = res_sqrt ~ grid_connection, data = CZ_res_grids_sqrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.178244 -0.021092 -0.004581 0.017548 0.262508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.365503 0.004988 73.274 <2e-16 ***
## grid_connectionAlaska 0.139808 0.014457 9.671 <2e-16 ***
## grid_connectionEastern -0.001804 0.005186 -0.348 0.7280
## grid_connectionHawaii 0.294566 0.022027 13.373 <2e-16 ***
## grid_connectionWestern -0.012200 0.005884 -2.073 0.0384 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04291 on 1189 degrees of freedom
## Multiple R-squared: 0.2072, Adjusted R-squared: 0.2045
## F-statistic: 77.67 on 4 and 1189 DF, p-value: < 2.2e-16
CZCLM<-lm(comm_sqrt~grid_connection, data = CZ_comm_grids_sqrt)
summary(CZCLM)
##
## Call:
## lm(formula = comm_sqrt ~ grid_connection, data = CZ_comm_grids_sqrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.176319 -0.024527 -0.003377 0.019414 0.262453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3490940 0.0050845 68.659 <2e-16 ***
## grid_connectionAlaska 0.1249122 0.0147362 8.477 <2e-16 ***
## grid_connectionEastern 0.0008251 0.0052863 0.156 0.8760
## grid_connectionHawaii 0.3014473 0.0224524 13.426 <2e-16 ***
## grid_connectionWestern -0.0121920 0.0059978 -2.033 0.0423 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04374 on 1186 degrees of freedom
## Multiple R-squared: 0.1944, Adjusted R-squared: 0.1917
## F-statistic: 71.55 on 4 and 1186 DF, p-value: < 2.2e-16
CZILM<-lm(ind_sqrt~grid_connection, data = CZ_ind_grids_sqrt)
summary(CZILM)
##
## Call:
## lm(formula = ind_sqrt ~ grid_connection, data = CZ_ind_grids_sqrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.122491 -0.029637 -0.007981 0.016964 0.282313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2996709 0.0062284 48.113 < 2e-16 ***
## grid_connectionAlaska 0.0680487 0.0209680 3.245 0.00121 **
## grid_connectionEastern 0.0007871 0.0064624 0.122 0.90308
## grid_connectionHawaii 0.3167972 0.0253000 12.522 < 2e-16 ***
## grid_connectionWestern -0.0017511 0.0072995 -0.240 0.81046
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04904 on 1043 degrees of freedom
## Multiple R-squared: 0.1453, Adjusted R-squared: 0.142
## F-statistic: 44.31 on 4 and 1043 DF, p-value: < 2.2e-16
#funny enough the square rooting actually hurt the R-squared values for the regressions, so I am going to put it back to regular rates.
CZRLM<-lm(res_rate~grid_connection, data = CZ_res_grids_sqrt)
summary(CZRLM)
##
## Call:
## lm(formula = res_rate ~ grid_connection, data = CZ_res_grids_sqrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.151253 -0.016463 -0.004913 0.011634 0.263985
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.134327 0.004010 33.502 <2e-16 ***
## grid_connectionAlaska 0.134816 0.011621 11.601 <2e-16 ***
## grid_connectionEastern -0.000663 0.004168 -0.159 0.874
## grid_connectionHawaii 0.301900 0.017706 17.051 <2e-16 ***
## grid_connectionWestern -0.005681 0.004730 -1.201 0.230
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03449 on 1189 degrees of freedom
## Multiple R-squared: 0.2808, Adjusted R-squared: 0.2784
## F-statistic: 116 on 4 and 1189 DF, p-value: < 2.2e-16
CZCLM<-lm(comm_rate~grid_connection, data = CZ_comm_grids_sqrt)
summary(CZCLM)
##
## Call:
## lm(formula = comm_rate ~ grid_connection, data = CZ_comm_grids_sqrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.131910 -0.018438 -0.003822 0.012451 0.251087
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.122867 0.003862 31.813 <2e-16 ***
## grid_connectionAlaska 0.113331 0.011194 10.125 <2e-16 ***
## grid_connectionEastern 0.001046 0.004015 0.261 0.794
## grid_connectionHawaii 0.300835 0.017055 17.639 <2e-16 ***
## grid_connectionWestern -0.005479 0.004556 -1.203 0.229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03322 on 1186 degrees of freedom
## Multiple R-squared: 0.2735, Adjusted R-squared: 0.2711
## F-statistic: 111.6 on 4 and 1186 DF, p-value: < 2.2e-16
CZILM<-lm(ind_rate~grid_connection, data = CZ_ind_grids_sqrt)
summary(CZILM)
##
## Call:
## lm(formula = ind_rate ~ grid_connection, data = CZ_ind_grids_sqrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.060790 -0.019425 -0.006964 0.008371 0.247161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.090912 0.004313 21.080 < 2e-16 ***
## grid_connectionAlaska 0.046191 0.014519 3.181 0.00151 **
## grid_connectionEastern 0.001549 0.004475 0.346 0.72924
## grid_connectionHawaii 0.289428 0.017518 16.522 < 2e-16 ***
## grid_connectionWestern 0.001796 0.005054 0.355 0.72244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03396 on 1043 degrees of freedom
## Multiple R-squared: 0.2213, Adjusted R-squared: 0.2183
## F-statistic: 74.11 on 4 and 1043 DF, p-value: < 2.2e-16
#So for each model they are roughly similar with the regression looking at the industrial rates having the lowest R-squared value.This means it accounts for less of the data, specifically about 21.83% going off of the adjusted R-squared. The one that accounts for the most of the models tested, we can see that the residential rate is the most predictive but it is still pretty poor with an adjusted R-square of .2784.
#For every model the Alaskan and Hawaiian grids were significantly different but the Western and Eastern Grids were not. The Alaskand and Hawaiian grids had P-values <2e-16 which implies they are incredibly singificant since the P-value is so incredibly low. The insignificant variables in this case are the Eastern and Western grids which are both >.05. It is interesting that the industrial model is different in this respect as the P-value is actually much higher going from .230 in the residential model to .72244 in the industrial model.
#In the residential model, two significant variables were the Alaskan and Hawaiian grids which had positive Beta coefficients of 0.134816 and 0.301900 respectively. I interpret this as for every 1 unit of the residential rates in Texas going up, the Alaskan residential rates and the Hawaiian residential rates would go up by 1 unit and 0.134816 and 0.301900 respectively. So this indicates a positive relationship and this would mean that on average Alaskan and Hawaiian rates would be higher by those degrees all else being equal and if the sample accounted for everything which we know it doesnt.
plot(CZRLM, which = 1)
#I believe the numbers should be based around the red line in an even spacing and should not be grouped or clustered together heavily. This causes issues in modeling and violates the assumption of linearity.