library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

# The dataset I chose is collected by the DOE as part of the yearly operations reporting by electric utilities. For each utility it records the organizational structure, the zip codes served, and the electricity rates for commercial, residential, and industrial customers.

# Load the two 2022 utility rate tables from the working directory.
iou_zipcodes_2022 <- read.csv(file = "iou_zipcodes_2022.csv")

non_iou_zipcodes_2022 <- read.csv(file = "non_iou_zipcodes_2022.csv")

# Here I combine the two datasets in order to have a unified dataset for all utilities in the country.

# Stack the two tables row-wise into a single data frame.
combinedzips <- rbind(
  iou_zipcodes_2022,
  non_iou_zipcodes_2022
)

# The dataset is unusual in that I am really only interested in studying one set of numbers, the residential rates, but the cor command needs numeric variables to compare. For this homework I will therefore showcase the cor command on the commercial, industrial, and residential rates. Below is residential versus commercial, then residential versus industrial, then commercial versus industrial. All are strongly correlated, as the prices are only offset in specific instances, generally to benefit either residential or industrial consumers.

# Pairwise correlations (cor's default method) between the residential,
# commercial, and industrial rate columns of the combined data.
cor(combinedzips$res_rate, combinedzips$comm_rate)
## [1] 0.8889202
cor(combinedzips$res_rate, combinedzips$ind_rate)
## [1] 0.7421116
cor(combinedzips$comm_rate, combinedzips$ind_rate)
## [1] 0.779103
# Scatterplot matrix of the same three columns. The data frame is supplied
# through `data =` so the formula holds bare column names; the panels are then
# labelled "res_rate", "comm_rate", "ind_rate" rather than
# "combinedzips$res_rate", etc.
pairs(~ res_rate + comm_rate + ind_rate, data = combinedzips)

# In the pairs plot we can see that all three rates are very strongly correlated with each other, apart from outliers such as the 0's. The 0's occur where a utility does not serve that customer class; the dataset records 0 instead of NA. Unfortunately, in my dataset there isn't really a good way to remove these 0's from every column, as doing so would delete information in one of the others. So I am going to focus specifically on the residential rates for my study.

# Keep only the rows whose residential rate is non-zero.
CZcleaned <- dplyr::filter(combinedzips, res_rate != 0)

# The filter above removes the rows where the residential rate is 0; now I will run the cor and pairs functions again on the cleaned data.

# Same pairwise correlations as before, recomputed on the data with the
# zero residential rates removed.
cor(CZcleaned$res_rate, CZcleaned$comm_rate)
## [1] 0.9033948
cor(CZcleaned$res_rate, CZcleaned$ind_rate)
## [1] 0.7425455
cor(CZcleaned$comm_rate, CZcleaned$ind_rate)
## [1] 0.7592853
# Scatterplot matrix of the cleaned columns. The data frame is supplied
# through `data =` so the formula holds bare column names; the panels are then
# labelled "res_rate", "comm_rate", "ind_rate" rather than
# "CZcleaned$res_rate", etc.
pairs(~ res_rate + comm_rate + ind_rate, data = CZcleaned)