library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#1) From the data you have chosen, select a variable that you are interested in
#I am interested in the differences in residential rates between the different grid connections.
# Read the two zip-code-level utility rate files and stack them into one data
# frame. rbind() requires both files to have the same columns.
iou_zipcodes_2022 <- read.csv("iou_zipcodes_2022.csv")
non_iou_zipcodes_2022 <- read.csv("non_iou_zipcodes_2022.csv")
combinedzips <- rbind(iou_zipcodes_2022, non_iou_zipcodes_2022)

# Add one `grid_connection` column labelling each row by the interconnection
# its state belongs to. All five regions are resolved inside a single
# case_when(): passing each region as a separate unnamed ifelse() to mutate()
# would only ever write "Eastern" into grid_connection and would add four
# stray columns named after the ifelse() expressions. States not listed in any
# region get NA.
res_rate_cleaned <- combinedzips %>%
  mutate(
    grid_connection = case_when(
      state %in% c(
        "AL", "MI", "IA", "VA", "WV", "AR", "TN", "NJ", "MD", "ME", "NC",
        "SC", "NY", "LA", "OH", "PA", "IL", "CT", "WI", "DE", "KS", "MO",
        "OK", "MA", "FL", "GA", "VT", "IN", "KY", "ND", "SD", "MN", "MS",
        "RI", "DC", "NH", "NE"
      ) ~ "Eastern",
      state %in% c("TX") ~ "Texas",
      state %in% c(
        "CA", "UT", "WY", "NM", "ID", "OR", "MT", "NV", "CO", "AZ", "WA"
      ) ~ "Western",
      state %in% c("AK") ~ "Alaska",
      state %in% c("HI") ~ "Hawaii",
      .default = NA_character_
    )
  )
#This is the function you helped me build that separates the states into grid connection groups, which are easier to work with and let me group similar entities together.
#2) Use pastecs::stat.desc to describe the variable. Include a few sentences about what the variable is and what it’s measuring.
# Descriptive statistics for the residential rate column, computed on the
# data as loaded (zero rates are still included at this point).
res_rate_cleaned %>%
  pull(res_rate) %>%
  pastecs::stat.desc()
## nbr.val nbr.null nbr.na min max range
## 8.010300e+04 4.505000e+03 0.000000e+00 0.000000e+00 5.331277e-01 5.331277e-01
## sum median mean SE.mean CI.mean.0.95 var
## 1.065908e+04 1.326708e-01 1.330671e-01 2.169713e-04 4.252624e-04 3.770972e-03
## std.dev coef.var
## 6.140824e-02 4.614832e-01
#The variable that I chose is the residential rate charged by the utilities in the 5 grid interconnection zones in the US. The residential rates are measured in dollars per kilowatt-hour. Once the utilities listed at $0 are set aside, the rates range from about $.03/kWh at one utility in Washington to about $.53/kWh at one Alaskan utility. The median price is around $.13/kWh, with a standard deviation of about $.06/kWh.
#3) Remove NA’s if needed using dplyr::filter (or anything similar)
# Keep only rows with a positive residential rate. Zero rates are dropped;
# filter() also drops rows where the comparison is NA. The column is
# referenced by its bare name (data masking) rather than via
# `res_rate_cleaned$res_rate`, so the condition always refers to the rows
# actually being filtered.
res_rate_cleaned <- res_rate_cleaned %>%
  filter(res_rate > 0)
#There are no NA’s in the residential rate column. What does skew the measurement is that certain utilities do not charge residential customers at all and are listed as 0’s in the dataset. I removed the 0’s for this reason: they are essentially NA’s and are not representative of the average consumer.
#4) Provide a histogram of the variable (as shown in this lesson)
# Histogram of the residential rates remaining after the filter step.
# `breaks = 10` is a suggested number of bins; hist() may choose a nearby
# "pretty" number of cells instead.
hist(res_rate_cleaned$res_rate, breaks = 10)
#5) Transform the variable using the log transformation or square root transformation (whatever is more appropriate) using dplyr::mutate or something similar
#6) Provide a histogram of the transformed variable
# Histogram of the square-root-transformed residential rates. The transform
# is applied on the fly here; no new column is stored in res_rate_cleaned.
hist(sqrt(res_rate_cleaned$res_rate),breaks=10)
#In order to normalize the distribution I needed to take the square root of the data.
#I did not use the log because the rates are decimals below 1, so their logs would be negative, which would muddy the data.