library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# The dataset I chose is collected by the DOE as part of electric utilities' yearly operations reporting. For each utility, it records the organizational structure, the zip codes it serves, and the electricity rates for commercial, residential, and industrial customers.

# Load the two 2022 zip-code rate files from the working directory.
# (Presumably "iou" = investor-owned utilities -- confirm against the source.)
# NOTE(review): read.csv() is parsing `zip` as a number here (the summary
# reports a minimum of 501), so any leading zeros in zip codes are dropped.
# Consider reading that column as character if zips are ever joined or printed.
iou_zipcodes_2022 <- read.csv(file = "iou_zipcodes_2022.csv")
non_iou_zipcodes_2022 <- read.csv(file = "non_iou_zipcodes_2022.csv")

# Stack the two tables row-wise into one data frame covering all utilities.
# rbind() on data frames needs both inputs to have the same column names.
combinedzips <- rbind(
  iou_zipcodes_2022,
  non_iou_zipcodes_2022
)

# I had a lot of trouble getting R to find the original CSVs, so I had to make a new project folder specifically for the datasets in order to read them properly. After doing that, I used rbind to combine the two datasets: they were published separately by ownership type, and I wanted to collate them into one so that all utilities are in the same data frame.

# Print per-column summary statistics for the combined data frame
# (quartiles for numeric columns, length/class for character columns).
combinedzips %>%
  summary()
##       zip            eiaid       utility_name          state          
##  Min.   :  501   Min.   :   55   Length:80103       Length:80103      
##  1st Qu.:21646   1st Qu.: 6411   Class :character   Class :character  
##  Median :44431   Median :13573   Mode  :character   Mode  :character  
##  Mean   :46440   Mean   :13799                                        
##  3rd Qu.:68035   3rd Qu.:17066                                        
##  Max.   :99950   Max.   :57483                                        
##  service_type        ownership           comm_rate          ind_rate      
##  Length:80103       Length:80103       Min.   :0.00000   Min.   :0.00000  
##  Class :character   Class :character   1st Qu.:0.08766   1st Qu.:0.05821  
##  Mode  :character   Mode  :character   Median :0.11465   Median :0.07821  
##                                        Mean   :0.11367   Mean   :0.08179  
##                                        3rd Qu.:0.13861   3rd Qu.:0.09838  
##                                        Max.   :0.46309   Max.   :0.40244  
##     res_rate     
##  Min.   :0.0000  
##  1st Qu.:0.1101  
##  Median :0.1327  
##  Mean   :0.1331  
##  3rd Qu.:0.1560  
##  Max.   :0.5331
# Histogram of the residential rate across every row of the combined data.
# An explicit title and x-axis label replace hist()'s defaults, which would
# otherwise display the raw expression `combinedzips$res_rate`.
# NOTE(review): the rate units are not stated anywhere in this file -- add
# them to `xlab` once confirmed against the dataset's documentation.
hist(
  combinedzips$res_rate,
  main = "Residential electricity rates, all utilities (2022)",
  xlab = "Residential rate"
)

# This shows the histogram of residential rates for customers of all utilities in the country.

# Box plot of residential rates, with one box per state.
# With every state on the x-axis the default horizontal tick labels run into
# each other, so they are rotated 90 degrees and shrunk slightly to keep each
# state readable. Axis titles are spelled out instead of using the raw column
# names.
ggplot(combinedzips, aes(x = state, y = res_rate)) +
  geom_boxplot(fill = "green") +
  labs(x = "State", y = "Residential rate") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 7)
  )

# Correlation (cor()'s default method) of the residential rate with the
# industrial rate, computed over every row of the combined data.
# NOTE(review): summary() reports a minimum of 0 for all three rate columns;
# if those zeros mean "no rate reported" rather than a real price, they will
# influence these coefficients -- confirm before interpreting.
with(combinedzips, cor(ind_rate, res_rate))
## [1] 0.7421116
# Same comparison, this time residential against commercial.
with(combinedzips, cor(comm_rate, res_rate))
## [1] 0.8889202

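# These two correlations compare the residential rate with the industrial rate and with the commercial rate. They show a discrepancy in how closely the rates follow each other: residential rates track commercial rates (about 0.89) more closely than they track industrial rates (about 0.74).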

# I would like to look into differences between states and ownership types in the future, but I will need more practice with R beforehand.