library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#1) From the data you have chosen, select a variable that you are interested in
#I am interested in the differences in residential rates between the different grid connections.

# Load the two 2022 utility rate tables (investor-owned and non-investor-owned)
# from the working directory.
iou_zipcodes_2022 <- read.csv(file = "iou_zipcodes_2022.csv")
non_iou_zipcodes_2022 <- read.csv(file = "non_iou_zipcodes_2022.csv")

# Stack both tables row-wise into one data frame; rbind() requires the two
# files to share the same column names.
combinedzips <- rbind(
  iou_zipcodes_2022,
  non_iou_zipcodes_2022
)
# Label every row with the grid interconnection its state is assigned to.
# All five regions are handled inside a single case_when() so the result is
# ONE grid_connection column. (Passing each region as its own ifelse()
# argument to mutate() only named the first one, leaving grid_connection as
# "Eastern"/NA and creating four extra columns named after their expressions.)
# Rules are checked top to bottom; a state that is in none of the lists
# is left as NA.
res_rate_cleaned <- combinedzips %>%
  mutate(
    grid_connection = case_when(
      state %in% c(
        "AL", "MI", "IA", "VA", "WV", "AR", "TN", "NJ", "MD", "ME", "NC",
        "SC", "NY", "LA", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO",
        "OK", "MA", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS",
        "RI", "DC", "NH", "NE"
      ) ~ "Eastern",
      state %in% c("TX") ~ "Texas",
      state %in% c(
        "CA", "UT", "WY", "NM", "ID", "OR", "MT", "NV", "CO", "AZ", "WA"
      ) ~ "Western",
      state %in% c("AK") ~ "Alaska",
      state %in% c("HI") ~ "Hawaii",
      .default = NA_character_
    )
  )

#This is the function you helped me build that separates the variables into different grid connection columns, which I can work with more easily and group into separate, similar entities.

#2) Use pastecs::stat.desc to describe the variable. Include a few sentences about what the variable is and what it’s measuring.

# Descriptive statistics (count, min/max, mean, median, variance, ...) for the
# residential rate column, extracted by exact name.
pastecs::stat.desc(res_rate_cleaned[["res_rate"]])
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 8.010300e+04 4.505000e+03 0.000000e+00 0.000000e+00 5.331277e-01 5.331277e-01 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 1.065908e+04 1.326708e-01 1.330671e-01 2.169713e-04 4.252624e-04 3.770972e-03 
##      std.dev     coef.var 
## 6.140824e-02 4.614832e-01

#The variable that I chose is the residential rate charged by the utilities in the 5 grid interconnection zones in the US. The residential rates are measured in dollars per kilowatt-hour. The rates range from about $.03/kwh at one utility in Washington up to about $.53/kwh at one Alaskan utility. The median price is around $.13/kwh, with a standard deviation of about $.06/kwh.

#3) Remove NA’s if needed using dplyr:filter (or anything similar)

# Keep only rows with a positive residential rate; rows recorded as 0 are
# treated as missing values. The column is referenced by its bare name so that
# dplyr's data masking is used (rather than re-reading the global data frame
# with `$`), and filter() is namespaced because stats::filter() has the same
# name.
res_rate_cleaned <- res_rate_cleaned %>%
  dplyr::filter(res_rate > 0)

#There are no NA’s in the residential rate column. What skews the measurement instead is that certain utilities do not charge residential customers at all and are listed as 0’s in the dataset. I removed the 0’s for this reason: they are essentially NA’s and are not representative of the average consumer.

#4) Provide a histogram of the variable (as shown in this lesson)

# Histogram of the residential rates that remain after the zero-rate rows were
# removed; breaks = 10 asks hist() for roughly ten bins.
hist(res_rate_cleaned$res_rate, breaks = 10)

#5) Transform the variable using the log transformation or square root transformation (whichever is more appropriate) using dplyr::mutate or something similar
#6) Provide a histogram of the transformed variable

# Histogram of the square-root-transformed residential rates, requesting
# roughly ten bins. The transform is applied on the fly inside the call; no
# new column is added to res_rate_cleaned.
hist(sqrt(res_rate_cleaned$res_rate),breaks=10) 

#In order to bring the distribution closer to normal, I took the square root of the variable. I did not take the log of the data because the rates are decimals below 1, which would make the logged values negative and muddy the data.