Files and data stored at: https://github.com/rweberc/Data607_Project2

Electricity demand (contributed by Rose Koh)

Net actual interchange (electric): The algebraic sum of all metered interchange over all interconnections between two physically Adjacent Balancing Authority Areas. NERC definition https://www.eia.gov/tools/glossary/?id=electricity

Read in data

elecDf <- read.csv("https://raw.githubusercontent.com/rweberc/Data607_Project2/master/electricity.csv", stringsAsFactors = FALSE, colClasses = "character", skip = 4)

Confirm data structure:

  • Every second line “Demand”
  • Every third line “”
  • Every fourth line “Total net actual interchange”
stopifnot(all(elecDf$megawatthours[c(FALSE, TRUE, FALSE, FALSE)] == "Demand"))
stopifnot(all(elecDf$megawatthours[c(FALSE, FALSE, TRUE, FALSE)] == "Net generation"))
stopifnot(all(elecDf$megawatthours[c(FALSE, FALSE, FALSE, TRUE)] == "Total net actual interchange"))

Extract region names

elecDf$region <- NA
elecDf$region[c(TRUE, FALSE, FALSE, FALSE)] <- elecDf$megawatthours[c(TRUE, FALSE, FALSE, FALSE)]

elecDf <- elecDf %>% fill(region)
table(elecDf$region, useNA = "ifany")
## 
##                           California (region) 
##                                             4 
##                            Carolinas (region) 
##                                             4 
##                              Central (region) 
##                                             4 
##         Electric Reliability Council of Texas 
##                                             4 
##                              Florida (region) 
##                                             4 
##                         Mid-Atlantic (region) 
##                                             4 
##                              Midwest (region) 
##                                             4 
##                      New England ISO (region) 
##                                             4 
## New York Independent System Operator (region) 
##                                             4 
##                            Northwest (region) 
##                                             4 
##                            Southeast (region) 
##                                             4 
##                            Southwest (region) 
##                                             4 
##           Tennessee Valley Authority (region) 
##                                             4 
##               United States Lower 48 (region) 
##                                             4
elecDf$region[c(TRUE, FALSE, FALSE, FALSE)] <- ""

Update column names

elecDf <- elecDf %>% rename(category = megawatthours)
elecDf <- elecDf %>% select(category, region, everything())

colnames(elecDf) <- str_replace(colnames(elecDf), "X", "")
colnames(elecDf) <- str_replace_all(colnames(elecDf), "\\.", "/")

colnames(elecDf)
##  [1] "category"   "region"     "01/01/2018" "01/02/2018" "01/03/2018"
##  [6] "01/04/2018" "01/05/2018" "01/06/2018" "01/07/2018" "01/08/2018"
## [11] "01/09/2018" "01/10/2018" "01/11/2018" "01/12/2018" "01/13/2018"
## [16] "01/14/2018" "01/15/2018" "01/16/2018" "01/17/2018" "01/18/2018"
## [21] "01/19/2018" "01/20/2018" "01/21/2018" "01/22/2018" "01/23/2018"
## [26] "01/24/2018" "01/25/2018" "01/26/2018" "01/27/2018" "01/28/2018"
## [31] "01/29/2018" "01/30/2018" "01/31/2018"

Wide to long

elecDf_long <- elecDf %>% gather("evalDay", "megawatthours", 3:length(.))

Clean data in long form

# data check
elecDf_long$megawatthours[elecDf_long$megawatthours == " Inc. (region)"] <- ""
stopifnot(elecDf_long$megawatthours[str_trim(elecDf_long$region) == ""] == "")

# filter out unneed "region" rows
elecDf_long <- elecDf_long %>% filter(str_trim(region) != "")

# parse day and megawatthours
elecDf_long$evalDay <- mdy(elecDf_long$evalDay)
elecDf_long$megawatthours <- as.numeric(elecDf_long$megawatthours)

Long to wide

elecDf <- elecDf_long %>% spread(category, megawatthours)
rm(elecDf_long)

# data check: confirm no megawatt hours are NA
stopifnot(nrow(elecDf %>% filter_if(is.Date, any_vars(is.na(.)))) == 0)

Suggested analyses

  • daily demand per region
  • daily net generation per region
  • daily total net actual interchange
perRegionAvg <- elecDf %>% 
  group_by(region) %>%
  summarize(demandAvg = mean(Demand),
                                     netGenAvg = mean(`Net generation`),
                                     totInterchgAvg = mean(`Total net actual interchange`))

ggplot(perRegionAvg, aes(x=region, y=totInterchgAvg)) + 
  geom_bar(stat = "identity") +
  labs(title = "Avg Interchange values", x = "Region", y = "Averge Interchange") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0))

California looks to have the greatest deficits, while the northwest region looks to have the greatest surplus.

  • Any particular days were demand was greater than production?
dayDf <- elecDf %>% 
  ungroup() %>%
  mutate(evalDayOfWeek = wday(evalDay, label = TRUE)) %>%
  group_by(region, evalDayOfWeek) %>%
  summarize(demandAvg = mean(Demand),
                                     netGenAvg = mean(`Net generation`),
                                     totInterchgAvg = mean(`Total net actual interchange`))

ggplot(dayDf, aes(x = region, y = totInterchgAvg)) + 
  geom_bar(aes(fill = evalDayOfWeek), position = "dodge", stat = "identity") +
  labs(title = "Avg Interchange values", x = "Region", y = "Averge Interchange") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0))

Net values do not seem dependent on days.