library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.2
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
iou_zipcodes_2022<-read.csv("iou_zipcodes_2022.csv")

non_iou_zipcodes_2022<-read.csv("non_iou_zipcodes_2022.csv")

#Here I combined the datasets in order to have a unified dataset for all utilites in the country.

combinedzips<-rbind(iou_zipcodes_2022,non_iou_zipcodes_2022)

#I am going to clean up some of the excess information here. Each row is almsot identical except for the zip code, so I am going to combine the rows into single columns based around the utility information.

CZ<-combinedzips %>%
  distinct(utility_name, .keep_all = TRUE)

#This is the function to clear the 0’s from the residential, commercial, and industrial rates.

CZres <- CZ %>% filter(res_rate != 0)

#I am specifically interested in how the grid interconnection may affect the overall model so I am going to add in a new column that assigns a grid interconnection value to each state. I will do this for each cleaned dataset.

CZ_res_grids<-CZres %>% mutate(grid_connection=ifelse(state %in% c("AL", "AR", "MA", "OK", "MI", "VA", "WV", "IA", "TN", "NJ", "MD", "ME", "NC", "SC", "NY", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS", "RI", "NH", "NE", "LA","DC"),"Eastern",
      ifelse(state %in% c("AZ","WY","NM","ID","OR","MT","NV","CA","UT","CO","WA"),"Western",
      ifelse(state %in% c("AK"),"Alaska",
      ifelse(state %in% c("HI"), "Hawaii",
      ifelse(state %in% c("TX"),"Texas",NA))))))
CZ_res_grids_sqrt<-CZ_res_grids %>% mutate(res_sqrt=sqrt(res_rate))
CZ_res_grids_sqrt$grid_connection <- relevel(factor(CZ_res_grids_sqrt$grid_connection), ref = "Texas")

#Here I add a new column labeled REG. I looked into it and I wanted to see the difference betweent the states for regulation versus the grid interconnection prices.

CZ_dereg<-CZ_res_grids_sqrt %>% mutate(REG=ifelse(state %in% c("TX", "CA", "IL", "NY", "PA", "OH", "NJ", "MD", "MA", "CN", "RI", "NH", "ME", "MI", "DE", "OR"),"Deregulated",
      ifelse(state %in% c("AZ","WY","NM","ID","MT","NV","UT","CO","WA","AL","AK","IA","VA","AR","WV","NC","LA","IL","CT","WI","FL","GA","VT","HI","IN","KS","KY","TN","SC","MT","MN","MS","MO","DC","OK","ND","NE","SD"),"Regulated", NA)))
CZLM<-lm(res_sqrt~REG+grid_connection, data = CZ_dereg)
summary(CZLM)
## 
## Call:
## lm(formula = res_sqrt ~ REG + grid_connection, data = CZ_dereg)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.168599 -0.019335 -0.001341  0.018006  0.236410 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.365503   0.004725  77.361  < 2e-16 ***
## REGRegulated           -0.035742   0.003050 -11.720  < 2e-16 ***
## grid_connectionAlaska   0.175550   0.014029  12.514  < 2e-16 ***
## grid_connectionEastern  0.027195   0.005500   4.945 8.71e-07 ***
## grid_connectionHawaii   0.330308   0.021085  15.665  < 2e-16 ***
## grid_connectionWestern  0.013898   0.006002   2.316   0.0207 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04064 on 1188 degrees of freedom
## Multiple R-squared:  0.2893, Adjusted R-squared:  0.2863 
## F-statistic: 96.73 on 5 and 1188 DF,  p-value: < 2.2e-16
raintest(CZLM)
## 
##  Rainbow test
## 
## data:  CZLM
## Rain = 1.7779, df1 = 597, df2 = 591, p-value = 1.724e-12
durbinWatsonTest(CZLM)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.09743654      1.805011       0
##  Alternative hypothesis: rho != 0

#General plot of model + BPTEST

plot(CZLM)

bptest(CZLM)
## 
##  studentized Breusch-Pagan test
## 
## data:  CZLM
## BP = 182.18, df = 5, p-value < 2.2e-16

#Here is the QQplot and shap test. We talked about the shap test not being significant but the data passes a visual test of normality.

plot(CZLM, which=2)

shapiro.test(CZ_dereg$res_sqrt)
## 
##  Shapiro-Wilk normality test
## 
## data:  CZ_dereg$res_sqrt
## W = 0.8276, p-value < 2.2e-16

#Cant do corellation as only one column is numeric.

vif(CZLM)
##                     GVIF Df GVIF^(1/(2*Df))
## REG             1.259148  1        1.122117
## grid_connection 1.259148  4        1.029223

#Honestly my model has a lot of problems with it as it stands right now, so I am going to include energy mixture by total electricity generated per state to see what happens when its plugged in.

#Here I added the dataset from the EPA to original csv. This will be notated as GridMix

GridMix<-read.csv("MS.csv")
GridMix$grid_connection <- relevel(factor(GridMix$grid_connection), ref = "Texas")
summary(GridMix)
##        X            unnamed..0          zip            eiaid      
##  Min.   :   0.0   Min.   :   1.0   Min.   : 1089   Min.   :   55  
##  1st Qu.: 298.2   1st Qu.: 299.2   1st Qu.:35771   1st Qu.: 5873  
##  Median : 596.5   Median : 597.5   Median :50024   Median :12204  
##  Mean   : 596.5   Mean   : 597.5   Mean   :52860   Mean   :12261  
##  3rd Qu.: 894.8   3rd Qu.: 895.8   3rd Qu.:72747   3rd Qu.:16823  
##  Max.   :1193.0   Max.   :1194.0   Max.   :99918   Max.   :57483  
##  utility_name          state           service_type        ownership        
##  Length:1194        Length:1194        Length:1194        Length:1194       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    comm_rate         ind_rate          res_rate       grid_connection
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.03065   Texas  : 74    
##  1st Qu.:0.1051   1st Qu.:0.06744   1st Qu.:0.11692   Alaska : 10    
##  Median :0.1194   Median :0.08224   Median :0.12857   Eastern:917    
##  Mean   :0.1244   Mean   :0.08226   Mean   :0.13506   Hawaii :  4    
##  3rd Qu.:0.1363   3rd Qu.:0.09802   3rd Qu.:0.14530   Western:189    
##  Max.   :0.4631   Max.   :0.40244   Max.   :0.53313                  
##     res_sqrt          reg            nameplate.capacity..mw.
##  Min.   :0.1751   Length:1194        Length:1194            
##  1st Qu.:0.3419   Class :character   Class :character       
##  Median :0.3586   Mode  :character   Mode  :character       
##  Mean   :0.3643                                             
##  3rd Qu.:0.3812                                             
##  Max.   :0.7302                                             
##  net.generation..mwh.      coal             oil                gas        
##  Length:1194          Min.   :0.0000   Min.   :0.000000   Min.   :0.0000  
##  Class :character     1st Qu.:0.1000   1st Qu.:0.000000   1st Qu.:0.2120  
##  Mode  :character     Median :0.1790   Median :0.001000   Median :0.3800  
##                       Mean   :0.2335   Mean   :0.007004   Mean   :0.3626  
##                       3rd Qu.:0.3240   3rd Qu.:0.003000   3rd Qu.:0.4790  
##                       Max.   :0.8950   Max.   :0.715000   Max.   :0.8950  
##   other.fossil         nuclear           hydro            biomass       
##  Min.   :0.000000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.00400   1st Qu.:0.00300  
##  Median :0.000000   Median :0.1270   Median :0.02000   Median :0.01100  
##  Mean   :0.004057   Mean   :0.1721   Mean   :0.07099   Mean   :0.01445  
##  3rd Qu.:0.005000   3rd Qu.:0.2690   3rd Qu.:0.06600   3rd Qu.:0.01900  
##  Max.   :0.044000   Max.   :0.5820   Max.   :0.67700   Max.   :0.36000  
##       wind            solar         geo..thermal     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000000  
##  1st Qu.:0.0000   1st Qu.:0.0060   1st Qu.:0.000000  
##  Median :0.0300   Median :0.0110   Median :0.000000  
##  Mean   :0.1039   Mean   :0.0287   Mean   :0.002188  
##  3rd Qu.:0.1480   3rd Qu.:0.0410   3rd Qu.:0.000000  
##  Max.   :0.6270   Max.   :0.1980   Max.   :0.094000  
##  other.unknown..purchased.fuel totalrenewable   total.fossilfuel
##  Min.   :0.0000000             Min.   :0.0070   Min.   :0.0030  
##  1st Qu.:0.0000000             1st Qu.:0.0750   1st Qu.:0.4740  
##  Median :0.0000000             Median :0.1210   Median :0.6100  
##  Mean   :0.0007705             Mean   :0.2058   Mean   :0.6072  
##  3rd Qu.:0.0010000             3rd Qu.:0.3040   3rd Qu.:0.7710  
##  Max.   :0.0100000             Max.   :0.8130   Max.   :0.9720  
##   totalnuclear      biomass.1      
##  Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00300  
##  Median :0.1270   Median :0.01100  
##  Mean   :0.1721   Mean   :0.01445  
##  3rd Qu.:0.2690   3rd Qu.:0.01900  
##  Max.   :0.5820   Max.   :0.36000
GMLM<-lm(res_sqrt~grid_connection+totalrenewable+reg, data = GridMix)
summary(GMLM)
## 
## Call:
## lm(formula = res_sqrt ~ grid_connection + totalrenewable + reg, 
##     data = GridMix)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.162067 -0.019548 -0.002344  0.017211  0.235253 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.373841   0.005028  74.348  < 2e-16 ***
## grid_connectionAlaska   0.175143   0.013913  12.588  < 2e-16 ***
## grid_connectionEastern  0.022823   0.005537   4.122 4.03e-05 ***
## grid_connectionHawaii   0.326070   0.020931  15.578  < 2e-16 ***
## grid_connectionWestern  0.019723   0.006087   3.240  0.00123 ** 
## totalrenewable         -0.032194   0.007044  -4.570 5.38e-06 ***
## regRegulated           -0.034756   0.003032 -11.462  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04031 on 1187 degrees of freedom
## Multiple R-squared:  0.3016, Adjusted R-squared:  0.2981 
## F-statistic: 85.44 on 6 and 1187 DF,  p-value: < 2.2e-16
GMLM<-lm(res_sqrt~grid_connection+totalrenewable+reg, data = GridMix)
summary(GMLM)
## 
## Call:
## lm(formula = res_sqrt ~ grid_connection + totalrenewable + reg, 
##     data = GridMix)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.162067 -0.019548 -0.002344  0.017211  0.235253 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.373841   0.005028  74.348  < 2e-16 ***
## grid_connectionAlaska   0.175143   0.013913  12.588  < 2e-16 ***
## grid_connectionEastern  0.022823   0.005537   4.122 4.03e-05 ***
## grid_connectionHawaii   0.326070   0.020931  15.578  < 2e-16 ***
## grid_connectionWestern  0.019723   0.006087   3.240  0.00123 ** 
## totalrenewable         -0.032194   0.007044  -4.570 5.38e-06 ***
## regRegulated           -0.034756   0.003032 -11.462  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04031 on 1187 degrees of freedom
## Multiple R-squared:  0.3016, Adjusted R-squared:  0.2981 
## F-statistic: 85.44 on 6 and 1187 DF,  p-value: < 2.2e-16
plot(GMLM)

bptest(GMLM)
## 
##  studentized Breusch-Pagan test
## 
## data:  GMLM
## BP = 186.03, df = 6, p-value < 2.2e-16
durbinWatsonTest(GMLM)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3540649      1.290822       0
##  Alternative hypothesis: rho != 0
vif(GMLM)
##                     GVIF Df GVIF^(1/(2*Df))
## grid_connection 1.871424  4        1.081488
## totalrenewable  1.486608  1        1.219266
## reg             1.265563  1        1.124972
shapiro.test(GridMix$res_sqrt)
## 
##  Shapiro-Wilk normality test
## 
## data:  GridMix$res_sqrt
## W = 0.8276, p-value < 2.2e-16
raintest(GMLM)
## 
##  Rainbow test
## 
## data:  GMLM
## Rain = 2.3105, df1 = 597, df2 = 590, p-value < 2.2e-16