library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
iou_zipcodes_2022<-read.csv("iou_zipcodes_2022.csv")
non_iou_zipcodes_2022<-read.csv("non_iou_zipcodes_2022.csv")
#Here I combined the datasets in order to have a unified dataset for all utilites in the country.
combinedzips<-rbind(iou_zipcodes_2022,non_iou_zipcodes_2022)
#I am going to clean up some of the excess information here. Each row is almsot identical except for the zip code, so I am going to combine the rows into single columns based around the utility information.
CZ<-combinedzips %>%
distinct(utility_name, .keep_all = TRUE)
#This is the function to clear the 0’s from the residential, commercial, and industrial rates.
CZres <- CZ %>% filter(res_rate != 0)
#I am specifically interested in how the grid interconnection may affect the overall model so I am going to add in a new column that assigns a grid interconnection value to each state. I will do this for each cleaned dataset.
CZ_res_grids<-CZres %>% mutate(grid_connection=ifelse(state %in% c("AL", "AR", "MA", "OK", "MI", "VA", "WV", "IA", "TN", "NJ", "MD", "ME", "NC", "SC", "NY", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS", "RI", "NH", "NE", "LA","DC"),"Eastern",
ifelse(state %in% c("AZ","WY","NM","ID","OR","MT","NV","CA","UT","CO","WA"),"Western",
ifelse(state %in% c("AK"),"Alaska",
ifelse(state %in% c("HI"), "Hawaii",
ifelse(state %in% c("TX"),"Texas",NA))))))
CZ_res_grids_sqrt<-CZ_res_grids %>% mutate(res_sqrt=sqrt(res_rate))
CZ_res_grids_sqrt$grid_connection <- relevel(factor(CZ_res_grids_sqrt$grid_connection), ref = "Texas")
#Here I add a new column labeled REG. I looked into it and I wanted to see the difference betweent the states for regulation versus the grid interconnection prices.
CZ_dereg<-CZ_res_grids_sqrt %>% mutate(REG=ifelse(state %in% c("TX", "CA", "IL", "NY", "PA", "OH", "NJ", "MD", "MA", "CN", "RI", "NH", "ME", "MI", "DE", "OR"),"Deregulated",
ifelse(state %in% c("AZ","WY","NM","ID","MT","NV","UT","CO","WA","AL","AK","IA","VA","AR","WV","NC","LA","IL","CT","WI","FL","GA","VT","HI","IN","KS","KY","TN","SC","MT","MN","MS","MO","DC","OK","ND","NE","SD"),"Regulated", NA)))
CZLM<-lm(res_sqrt~REG+grid_connection, data = CZ_dereg)
summary(CZLM)
##
## Call:
## lm(formula = res_sqrt ~ REG + grid_connection, data = CZ_dereg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.168599 -0.019335 -0.001341 0.018006 0.236410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.365503 0.004725 77.361 < 2e-16 ***
## REGRegulated -0.035742 0.003050 -11.720 < 2e-16 ***
## grid_connectionAlaska 0.175550 0.014029 12.514 < 2e-16 ***
## grid_connectionEastern 0.027195 0.005500 4.945 8.71e-07 ***
## grid_connectionHawaii 0.330308 0.021085 15.665 < 2e-16 ***
## grid_connectionWestern 0.013898 0.006002 2.316 0.0207 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04064 on 1188 degrees of freedom
## Multiple R-squared: 0.2893, Adjusted R-squared: 0.2863
## F-statistic: 96.73 on 5 and 1188 DF, p-value: < 2.2e-16
plot(CZLM)
raintest(CZLM)
##
## Rainbow test
##
## data: CZLM
## Rain = 1.7779, df1 = 597, df2 = 591, p-value = 1.724e-12
durbinWatsonTest(CZLM)
## lag Autocorrelation D-W Statistic p-value
## 1 0.09743654 1.805011 0
## Alternative hypothesis: rho != 0
#General plot of model + BPTEST
plot(CZLM)
bptest(CZLM)
##
## studentized Breusch-Pagan test
##
## data: CZLM
## BP = 182.18, df = 5, p-value < 2.2e-16
#Here is the QQplot and shap test. We talked about the shap test not being significant but the data passes a visual test of normality.
plot(CZLM, which=2)
shapiro.test(CZ_dereg$res_sqrt)
##
## Shapiro-Wilk normality test
##
## data: CZ_dereg$res_sqrt
## W = 0.8276, p-value < 2.2e-16
#Cant do corellation as only one column is numeric.
vif(CZLM)
## GVIF Df GVIF^(1/(2*Df))
## REG 1.259148 1 1.122117
## grid_connection 1.259148 4 1.029223
CZGLM<-glm(res_rate~grid_connection + REG,data = CZ_dereg,family = Gamma(link=log))
summary(CZGLM)
##
## Call:
## glm(formula = res_rate ~ grid_connection + REG, family = Gamma(link = log),
## data = CZ_dereg)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.00748 0.02569 -78.131 < 2e-16 ***
## grid_connectionAlaska 0.90572 0.07629 11.872 < 2e-16 ***
## grid_connectionEastern 0.16325 0.02991 5.458 5.85e-08 ***
## grid_connectionHawaii 1.38864 0.11467 12.110 < 2e-16 ***
## grid_connectionWestern 0.10259 0.03264 3.143 0.00171 **
## REGRegulated -0.21076 0.01659 -12.708 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.04885249)
##
## Null deviance: 78.672 on 1193 degrees of freedom
## Residual deviance: 55.419 on 1188 degrees of freedom
## AIC: -5113.6
##
## Number of Fisher Scoring iterations: 4
AIC(CZLM,CZGLM)
## df AIC
## CZLM 7 -4252.196
## CZGLM 7 -5113.636
#6) What would you do to mitigate this assumption? Show your work.