library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
iou_zipcodes_2022<-read.csv("iou_zipcodes_2022.csv")
non_iou_zipcodes_2022<-read.csv("non_iou_zipcodes_2022.csv")
#Here I combined the datasets in order to have a unified dataset for all utilites in the country.
combinedzips<-rbind(iou_zipcodes_2022,non_iou_zipcodes_2022)
#I am going to clean up some of the excess information here. Each row is almsot identical except for the zip code, so I am going to combine the rows into single columns based around the utility information.
CZ<-combinedzips %>%
distinct(utility_name, .keep_all = TRUE)
#This is the function to clear the 0’s from the residential, commercial, and industrial rates.
CZres <- CZ %>% filter(res_rate != 0)
#I am specifically interested in how the grid interconnection may affect the overall model so I am going to add in a new column that assigns a grid interconnection value to each state. I will do this for each cleaned dataset.
CZ_res_grids<-CZres %>% mutate(grid_connection=ifelse(state %in% c("AL", "AR", "MA", "OK", "MI", "VA", "WV", "IA", "TN", "NJ", "MD", "ME", "NC", "SC", "NY", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS", "RI", "NH", "NE", "LA","DC"),"Eastern",
ifelse(state %in% c("AZ","WY","NM","ID","OR","MT","NV","CA","UT","CO","WA"),"Western",
ifelse(state %in% c("AK"),"Alaska",
ifelse(state %in% c("HI"), "Hawaii",
ifelse(state %in% c("TX"),"Texas",NA))))))
CZ_res_grids_sqrt<-CZ_res_grids %>% mutate(res_sqrt=sqrt(res_rate))
CZ_res_grids_sqrt$grid_connection <- relevel(factor(CZ_res_grids_sqrt$grid_connection), ref = "Texas")
#Here I add a new column labeled REG. I looked into it and I wanted to see the difference betweent the states for regulation versus the grid interconnection prices.
CZ_dereg<-CZ_res_grids_sqrt %>% mutate(REG=ifelse(state %in% c("TX", "CA", "IL", "NY", "PA", "OH", "NJ", "MD", "MA", "CN", "RI", "NH", "ME", "MI", "DE", "OR"),"Deregulated",
ifelse(state %in% c("AZ","WY","NM","ID","MT","NV","UT","CO","WA","AL","AK","IA","VA","AR","WV","NC","LA","IL","CT","WI","FL","GA","VT","HI","IN","KS","KY","TN","SC","MT","MN","MS","MO","DC","OK","ND","NE","SD"),"Regulated", NA)))
CZLM<-lm(res_sqrt~REG+grid_connection, data = CZ_dereg)
summary(CZLM)
##
## Call:
## lm(formula = res_sqrt ~ REG + grid_connection, data = CZ_dereg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.168599 -0.019335 -0.001341 0.018006 0.236410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.365503 0.004725 77.361 < 2e-16 ***
## REGRegulated -0.035742 0.003050 -11.720 < 2e-16 ***
## grid_connectionAlaska 0.175550 0.014029 12.514 < 2e-16 ***
## grid_connectionEastern 0.027195 0.005500 4.945 8.71e-07 ***
## grid_connectionHawaii 0.330308 0.021085 15.665 < 2e-16 ***
## grid_connectionWestern 0.013898 0.006002 2.316 0.0207 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04064 on 1188 degrees of freedom
## Multiple R-squared: 0.2893, Adjusted R-squared: 0.2863
## F-statistic: 96.73 on 5 and 1188 DF, p-value: < 2.2e-16
raintest(CZLM)
##
## Rainbow test
##
## data: CZLM
## Rain = 1.7779, df1 = 597, df2 = 591, p-value = 1.724e-12
durbinWatsonTest(CZLM)
## lag Autocorrelation D-W Statistic p-value
## 1 0.09743654 1.805011 0
## Alternative hypothesis: rho != 0
#General plot of model + BPTEST
plot(CZLM)
bptest(CZLM)
##
## studentized Breusch-Pagan test
##
## data: CZLM
## BP = 182.18, df = 5, p-value < 2.2e-16
#Here is the QQplot and shap test. We talked about the shap test not being significant but the data passes a visual test of normality.
plot(CZLM, which=2)
shapiro.test(CZ_dereg$res_sqrt)
##
## Shapiro-Wilk normality test
##
## data: CZ_dereg$res_sqrt
## W = 0.8276, p-value < 2.2e-16
#Cant do corellation as only one column is numeric.
vif(CZLM)
## GVIF Df GVIF^(1/(2*Df))
## REG 1.259148 1 1.122117
## grid_connection 1.259148 4 1.029223
#Honestly my model has a lot of problems with it as it stands right now, so I am going to include energy mixture by total electricity generated per state to see what happens when its plugged in.
#Here I added the dataset from the EPA to original csv. This will be notated as GridMix
GridMix<-read.csv("MS.csv")
GridMix$grid_connection <- relevel(factor(GridMix$grid_connection), ref = "Texas")
summary(GridMix)
## X unnamed..0 zip eiaid
## Min. : 0.0 Min. : 1.0 Min. : 1089 Min. : 55
## 1st Qu.: 298.2 1st Qu.: 299.2 1st Qu.:35771 1st Qu.: 5873
## Median : 596.5 Median : 597.5 Median :50024 Median :12204
## Mean : 596.5 Mean : 597.5 Mean :52860 Mean :12261
## 3rd Qu.: 894.8 3rd Qu.: 895.8 3rd Qu.:72747 3rd Qu.:16823
## Max. :1193.0 Max. :1194.0 Max. :99918 Max. :57483
## utility_name state service_type ownership
## Length:1194 Length:1194 Length:1194 Length:1194
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## comm_rate ind_rate res_rate grid_connection
## Min. :0.0000 Min. :0.00000 Min. :0.03065 Texas : 74
## 1st Qu.:0.1051 1st Qu.:0.06744 1st Qu.:0.11692 Alaska : 10
## Median :0.1194 Median :0.08224 Median :0.12857 Eastern:917
## Mean :0.1244 Mean :0.08226 Mean :0.13506 Hawaii : 4
## 3rd Qu.:0.1363 3rd Qu.:0.09802 3rd Qu.:0.14530 Western:189
## Max. :0.4631 Max. :0.40244 Max. :0.53313
## res_sqrt reg nameplate.capacity..mw.
## Min. :0.1751 Length:1194 Length:1194
## 1st Qu.:0.3419 Class :character Class :character
## Median :0.3586 Mode :character Mode :character
## Mean :0.3643
## 3rd Qu.:0.3812
## Max. :0.7302
## net.generation..mwh. coal oil gas
## Length:1194 Min. :0.0000 Min. :0.000000 Min. :0.0000
## Class :character 1st Qu.:0.1000 1st Qu.:0.000000 1st Qu.:0.2120
## Mode :character Median :0.1790 Median :0.001000 Median :0.3800
## Mean :0.2335 Mean :0.007004 Mean :0.3626
## 3rd Qu.:0.3240 3rd Qu.:0.003000 3rd Qu.:0.4790
## Max. :0.8950 Max. :0.715000 Max. :0.8950
## other.fossil nuclear hydro biomass
## Min. :0.000000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.00400 1st Qu.:0.00300
## Median :0.000000 Median :0.1270 Median :0.02000 Median :0.01100
## Mean :0.004057 Mean :0.1721 Mean :0.07099 Mean :0.01445
## 3rd Qu.:0.005000 3rd Qu.:0.2690 3rd Qu.:0.06600 3rd Qu.:0.01900
## Max. :0.044000 Max. :0.5820 Max. :0.67700 Max. :0.36000
## wind solar geo..thermal
## Min. :0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.0060 1st Qu.:0.000000
## Median :0.0300 Median :0.0110 Median :0.000000
## Mean :0.1039 Mean :0.0287 Mean :0.002188
## 3rd Qu.:0.1480 3rd Qu.:0.0410 3rd Qu.:0.000000
## Max. :0.6270 Max. :0.1980 Max. :0.094000
## other.unknown..purchased.fuel totalrenewable total.fossilfuel
## Min. :0.0000000 Min. :0.0070 Min. :0.0030
## 1st Qu.:0.0000000 1st Qu.:0.0750 1st Qu.:0.4740
## Median :0.0000000 Median :0.1210 Median :0.6100
## Mean :0.0007705 Mean :0.2058 Mean :0.6072
## 3rd Qu.:0.0010000 3rd Qu.:0.3040 3rd Qu.:0.7710
## Max. :0.0100000 Max. :0.8130 Max. :0.9720
## totalnuclear biomass.1
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00300
## Median :0.1270 Median :0.01100
## Mean :0.1721 Mean :0.01445
## 3rd Qu.:0.2690 3rd Qu.:0.01900
## Max. :0.5820 Max. :0.36000
GMLM<-lm(res_sqrt~grid_connection+totalrenewable+reg, data = GridMix)
summary(GMLM)
##
## Call:
## lm(formula = res_sqrt ~ grid_connection + totalrenewable + reg,
## data = GridMix)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.162067 -0.019548 -0.002344 0.017211 0.235253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.373841 0.005028 74.348 < 2e-16 ***
## grid_connectionAlaska 0.175143 0.013913 12.588 < 2e-16 ***
## grid_connectionEastern 0.022823 0.005537 4.122 4.03e-05 ***
## grid_connectionHawaii 0.326070 0.020931 15.578 < 2e-16 ***
## grid_connectionWestern 0.019723 0.006087 3.240 0.00123 **
## totalrenewable -0.032194 0.007044 -4.570 5.38e-06 ***
## regRegulated -0.034756 0.003032 -11.462 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04031 on 1187 degrees of freedom
## Multiple R-squared: 0.3016, Adjusted R-squared: 0.2981
## F-statistic: 85.44 on 6 and 1187 DF, p-value: < 2.2e-16
GMLM<-lm(res_sqrt~grid_connection+totalrenewable+reg, data = GridMix)
summary(GMLM)
##
## Call:
## lm(formula = res_sqrt ~ grid_connection + totalrenewable + reg,
## data = GridMix)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.162067 -0.019548 -0.002344 0.017211 0.235253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.373841 0.005028 74.348 < 2e-16 ***
## grid_connectionAlaska 0.175143 0.013913 12.588 < 2e-16 ***
## grid_connectionEastern 0.022823 0.005537 4.122 4.03e-05 ***
## grid_connectionHawaii 0.326070 0.020931 15.578 < 2e-16 ***
## grid_connectionWestern 0.019723 0.006087 3.240 0.00123 **
## totalrenewable -0.032194 0.007044 -4.570 5.38e-06 ***
## regRegulated -0.034756 0.003032 -11.462 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04031 on 1187 degrees of freedom
## Multiple R-squared: 0.3016, Adjusted R-squared: 0.2981
## F-statistic: 85.44 on 6 and 1187 DF, p-value: < 2.2e-16
plot(GMLM)
bptest(GMLM)
##
## studentized Breusch-Pagan test
##
## data: GMLM
## BP = 186.03, df = 6, p-value < 2.2e-16
durbinWatsonTest(GMLM)
## lag Autocorrelation D-W Statistic p-value
## 1 0.3540649 1.290822 0
## Alternative hypothesis: rho != 0
vif(GMLM)
## GVIF Df GVIF^(1/(2*Df))
## grid_connection 1.871424 4 1.081488
## totalrenewable 1.486608 1 1.219266
## reg 1.265563 1 1.124972
shapiro.test(GridMix$res_sqrt)
##
## Shapiro-Wilk normality test
##
## data: GridMix$res_sqrt
## W = 0.8276, p-value < 2.2e-16
raintest(GMLM)
##
## Rainbow test
##
## data: GMLM
## Rain = 2.3105, df1 = 597, df2 = 590, p-value < 2.2e-16