library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.2
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
iou_zipcodes_2022<-read.csv("iou_zipcodes_2022.csv")

non_iou_zipcodes_2022<-read.csv("non_iou_zipcodes_2022.csv")

#Here I combined the datasets in order to have a unified dataset for all utilites in the country.

combinedzips<-rbind(iou_zipcodes_2022,non_iou_zipcodes_2022)

#I am going to clean up some of the excess information here. Each row is almsot identical except for the zip code, so I am going to combine the rows into single columns based around the utility information.

CZ<-combinedzips %>%
  distinct(utility_name, .keep_all = TRUE)

#This is the function to clear the 0’s from the residential, commercial, and industrial rates.

CZres <- CZ %>% filter(res_rate != 0)

#I am specifically interested in how the grid interconnection may affect the overall model so I am going to add in a new column that assigns a grid interconnection value to each state. I will do this for each cleaned dataset.

CZ_res_grids<-CZres %>% mutate(grid_connection=ifelse(state %in% c("AL", "AR", "MA", "OK", "MI", "VA", "WV", "IA", "TN", "NJ", "MD", "ME", "NC", "SC", "NY", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS", "RI", "NH", "NE", "LA","DC"),"Eastern",
      ifelse(state %in% c("AZ","WY","NM","ID","OR","MT","NV","CA","UT","CO","WA"),"Western",
      ifelse(state %in% c("AK"),"Alaska",
      ifelse(state %in% c("HI"), "Hawaii",
      ifelse(state %in% c("TX"),"Texas",NA))))))
CZ_res_grids_sqrt<-CZ_res_grids %>% mutate(res_sqrt=sqrt(res_rate))
CZ_res_grids_sqrt$grid_connection <- relevel(factor(CZ_res_grids_sqrt$grid_connection), ref = "Texas")

#Here I add a new column labeled REG. I looked into it and I wanted to see the difference betweent the states for regulation versus the grid interconnection prices.

CZ_dereg<-CZ_res_grids_sqrt %>% mutate(REG=ifelse(state %in% c("TX", "CA", "IL", "NY", "PA", "OH", "NJ", "MD", "MA", "CN", "RI", "NH", "ME", "MI", "DE", "OR"),"Deregulated",
      ifelse(state %in% c("AZ","WY","NM","ID","MT","NV","UT","CO","WA","AL","AK","IA","VA","AR","WV","NC","LA","IL","CT","WI","FL","GA","VT","HI","IN","KS","KY","TN","SC","MT","MN","MS","MO","DC","OK","ND","NE","SD"),"Regulated", NA)))
CZLM<-lm(res_sqrt~REG+grid_connection, data = CZ_dereg)
summary(CZLM)
## 
## Call:
## lm(formula = res_sqrt ~ REG + grid_connection, data = CZ_dereg)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.168599 -0.019335 -0.001341  0.018006  0.236410 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.365503   0.004725  77.361  < 2e-16 ***
## REGRegulated           -0.035742   0.003050 -11.720  < 2e-16 ***
## grid_connectionAlaska   0.175550   0.014029  12.514  < 2e-16 ***
## grid_connectionEastern  0.027195   0.005500   4.945 8.71e-07 ***
## grid_connectionHawaii   0.330308   0.021085  15.665  < 2e-16 ***
## grid_connectionWestern  0.013898   0.006002   2.316   0.0207 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04064 on 1188 degrees of freedom
## Multiple R-squared:  0.2893, Adjusted R-squared:  0.2863 
## F-statistic: 96.73 on 5 and 1188 DF,  p-value: < 2.2e-16
plot(CZLM)

raintest(CZLM)
## 
##  Rainbow test
## 
## data:  CZLM
## Rain = 1.7779, df1 = 597, df2 = 591, p-value = 1.724e-12
durbinWatsonTest(CZLM)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.09743654      1.805011       0
##  Alternative hypothesis: rho != 0

#General plot of model + BPTEST

plot(CZLM)

bptest(CZLM)
## 
##  studentized Breusch-Pagan test
## 
## data:  CZLM
## BP = 182.18, df = 5, p-value < 2.2e-16

#Here is the QQplot and shap test. We talked about the shap test not being significant but the data passes a visual test of normality.

plot(CZLM, which=2)

shapiro.test(CZ_dereg$res_sqrt)
## 
##  Shapiro-Wilk normality test
## 
## data:  CZ_dereg$res_sqrt
## W = 0.8276, p-value < 2.2e-16

#Cant do corellation as only one column is numeric.

vif(CZLM)
##                     GVIF Df GVIF^(1/(2*Df))
## REG             1.259148  1        1.122117
## grid_connection 1.259148  4        1.029223
CZGLM<-glm(res_rate~grid_connection + REG,data = CZ_dereg,family = Gamma(link=log))
summary(CZGLM)
## 
## Call:
## glm(formula = res_rate ~ grid_connection + REG, family = Gamma(link = log), 
##     data = CZ_dereg)
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -2.00748    0.02569 -78.131  < 2e-16 ***
## grid_connectionAlaska   0.90572    0.07629  11.872  < 2e-16 ***
## grid_connectionEastern  0.16325    0.02991   5.458 5.85e-08 ***
## grid_connectionHawaii   1.38864    0.11467  12.110  < 2e-16 ***
## grid_connectionWestern  0.10259    0.03264   3.143  0.00171 ** 
## REGRegulated           -0.21076    0.01659 -12.708  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.04885249)
## 
##     Null deviance: 78.672  on 1193  degrees of freedom
## Residual deviance: 55.419  on 1188  degrees of freedom
## AIC: -5113.6
## 
## Number of Fisher Scoring iterations: 4
AIC(CZLM,CZGLM)
##       df       AIC
## CZLM   7 -4252.196
## CZGLM  7 -5113.636

does your model meet those assumptions? You don’t have to be perfectly right, just make a good case.

- Yes my model did not meet the shapiro test assumptions for normality, failed the durbin watson autocorrelation of residuals, and the rainbow test. Looking at the polot graphs of the linear model it looks like my data is situated in such a way that there are vertical lines in the graphs which indicate some sort of pattern that the linear model is not picking up and means theres grouping of variables which throw it off.

#6) What would you do to mitigate this assumption? Show your work.

- We talked about how the test for normality in the shapiro-wilkes test was “fixed” through the process of square rooting the dependent variable, but these other tests for assumptions really through it off in those linear models. Doing a generalized linear model looks to have helped as the AIC is lower than the regular linear model. I still have doubts about the overall model as I believe it ultimately needs additional information to be more representative, but this is the information I have in this dataset.