Introduction

This analysis uses the Energy Usage Analysis System dataset, which tracks energy consumption across government facilities. It focuses on electricity, natural gas, fuel oil, chilled water, and steam usage from 1973 to 2019, as published by the U.S. General Services Administration (https://catalog.data.gov/dataset/energy-usage-analysis-system ).

After extensive data cleaning and transformation, I focused on the top 12 facilities by total energy usage. Rows with null values, negative or zero energy usage, and extreme outliers (e.g., values exceeding 1e14 kBtu) were removed to ensure meaningful comparisons. All selected facilities have total usage in the trillions of kBtu or less, providing a consistent and interpretable dataset for visualization.

# Imports
library(readxl)
library(tidyverse)
library(dplyr)
library(sqldf)
library(paletteer)
library(scales)
library(RColorBrewer)
library(ggthemes)
library(tidyr)
library(plotly)
library(leaflet)
library(htmltools)
library(patchwork)

setwd("/Users/sarahsmith/Desktop/EUAS/EUAS/data")

# main table with the energy types and usage amounts
df_1 <- read_excel("mar.energyutilization.xlsx")

# drop unneeded columns
cols_to_drop <- c("KWHRASRC", "KWHRDMDASRC", "STEAMASRC", "GASASRC", "OILASRC", 
                  "COALASRC", "KWHRCSRC", "KWDMDCSRC", "STEAMCSRC", "GASCSRC", 
                  "OILCSRC", "COALCSRC", "CHILLWTRCSRC", "CHILLWTRASRC", "REMARKS", 
                  "WATER_REMARKS", "WTRCSRC", "WTRASRC", "REOILCSRC", "REOILASRC", 
                  "REGASCSRC", "REGASASRC", "REELECCSRC", "REELECASRC", "RECWTRCSRC", 
                  "RECWTRASRC", "REELECAMT", "REELECCOST", "REGASAMT", "REGASCOST", 
                  "REOILCOST", "RECWTRAMT", "RECWTRCOST", "REOILAMT")

df_main = df_1[,!(names(df_1) %in% cols_to_drop)]

# condense all cost and amounts into one column
df_main <- df_main %>%
  pivot_longer(
    cols = c(ends_with("AMT"), ends_with("COST")),
    names_to = c("EnergyType", ".value"),
    names_pattern = "(.*)(AMT|COST)"
  )


# rename the energy type from abbreviation to full form
df_main <- df_main %>%
  mutate(
    EnergyType = recode(
      EnergyType,
      "KWHR"  = "Electricity",
      "KWDMD" = "Peak electricity demand (kW)", 
      "STEAM" = "Steam", 
      "GAS" = "Natural gas", 
      "OIL" = "Fuel Oil", 
      "COAL" = "Coal", 
      "CHILLWTR" = "Chilled Water",
      "WTR" = "Water"
    )
  )

# make succinct df of conversion factors
conversion_factors <- data.frame(
  EnergyType = c("Electricity", "Peak electricity demand (kW)", "Steam", "Natural gas", "Fuel Oil", "Coal", "Chilled Water", "Water"),
  factor_to_kBtu = c(
    3412.142,   # kWh
    1,          # kW demand (not an energy unit)
    1000000,    # steam (per Mlb)
    1031,       # natural gas (per cubic foot)
    24580000,   # coal (per ton)
    138700,     # oil (per gallon)
    12000,      # chilled water (per ton-hour)
    1           # water 
  )
)

# apply the conversion factors to usage amounts in both df's
df_main <- df_main %>%
  left_join(conversion_factors, by = "EnergyType") %>%
  mutate(
    Value_kBtu = AMT * factor_to_kBtu
  )

# df containing info about the bldgs - import, clean
facility_buildings <- read_excel("mar.facilitybuildings.xlsx")
fb_cols_to_drop <- c("REGNNUM", "CBECS_CLIMATE_ZONE", "WEATHER_STATION")
facility_buildings = facility_buildings[,!(names(facility_buildings) %in% fb_cols_to_drop)]

# df with building info including name - import, clean
df_names <- read_excel("mar.areafieldoffice.xlsx")
names_cols_to_drop <- c("REGION_ID", "DISTRICT", "STATE")
df_names = df_names[,!names(df_names) %in% names_cols_to_drop]

# df with building info gross sqft
fyr_buildings <- read_excel("mar.fyrbuilding.xlsx")

# join names with buildings
facility_buildings_unique <- facility_buildings %>%
  distinct(AREAFOCD, .keep_all = TRUE)

names_unique <- df_names %>%
  distinct(AREA_FIELD_OFFICE_CD, .keep_all = TRUE)

fac_names <- sqldf("SELECT e.*,
                           f.NAME
                           FROM facility_buildings_unique e
                           LEFT JOIN names_unique f
                           ON e.AREAFOCD = f.AREA_FIELD_OFFICE_CD"
)


unique_fac_names <- fac_names %>%
  group_by(BLDGNUM) %>%
  summarise(across(everything(), first))

merged <- left_join(df_main, unique_fac_names, by = "BLDGNUM")

merged_clean <- merged %>%
  filter(!is.na(EnergyType), !is.na(AMT), !is.na(COST), !is.na(NAME), !is.na(CLASSCODE))

merged_nonzero <- merged_clean %>% filter(AMT > 0)

df_peak_demand_nonzero <- merged_nonzero %>% filter(EnergyType == "Peak electricity demand (kW)")
df_elec_only <- merged_nonzero %>% filter(EnergyType == "Electricity")

energy_nonzero <- merged_nonzero %>% filter(EnergyType != "Peak electricity demand (kW)")

# total usage, across all years and all energy types
facility_totals <- merged_nonzero %>%
  group_by(BLDGNUM) %>%
  summarise(TotalUsage = sum(Value_kBtu, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(TotalUsage))

# sum up energy usage totals per facility
facility_usage_bytype <- merged_nonzero %>%
  group_by(BLDGNUM, FYR, EnergyType) %>%
  summarise(TotalUsage = sum(Value_kBtu, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(TotalUsage))

# sum up energy usage totals per type and facility
facility_type_summed_usage <- merged_nonzero %>%
  group_by(BLDGNUM, EnergyType) %>%
  summarise(TotalUsage = sum(Value_kBtu, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(TotalUsage))

facility_usage_unique_bldgnum <- facility_totals %>%
  distinct(BLDGNUM, .keep_all = TRUE)

# top 12 facilities - create df's and join with the name and other info
top_twelve_bldgnum <- facility_totals$BLDGNUM[2:13] 

top12_usage_bytype <- facility_usage_bytype %>%
  filter(BLDGNUM %in% top_twelve_bldgnum)

top12_type_summed <- facility_type_summed_usage %>%
  filter(BLDGNUM %in% top_twelve_bldgnum)

top12_facility_usage <- facility_totals %>%
  filter(BLDGNUM %in% top_twelve_bldgnum)
  
# top 12 total usage of electricity
top12_electricity <- df_elec_only %>%
  filter(BLDGNUM %in% top_twelve_bldgnum) %>%
  group_by(BLDGNUM, FYR) %>%
  summarise(Total_Electricity = sum(Value_kBtu / 3412.142, na.rm = TRUE)) %>%
  ungroup()

# top 12 total usage of peak demand
top12_peak_demand <- df_peak_demand_nonzero %>%
  filter(BLDGNUM %in% top_twelve_bldgnum) %>%
  group_by(BLDGNUM, FYR) %>%
  summarise(Total_PeakDemand = sum(Value_kBtu, na.rm = TRUE)) %>%
    ungroup()

top12_peak_demand <- top12_peak_demand %>%
  left_join(fac_names, by = "BLDGNUM")

top12_electricity <- top12_electricity %>%
  left_join(fac_names, by = "BLDGNUM")

# merge the top 12 df's of peak demand and electricity
top12_elec_demand_combined <- inner_join(
  top12_electricity,
  top12_peak_demand,
  by = c("BLDGNUM", "FYR")
)

top12_elec_demand_combined <- top12_elec_demand_combined %>%
  left_join(fac_names, by = "BLDGNUM")

top12_facility_usage <- top12_facility_usage %>%
  left_join(fac_names, by = "BLDGNUM")

facility_totals <- facility_totals %>%
  left_join(fac_names, by = "BLDGNUM")

facility_usage_bytype <- facility_usage_bytype %>%
  left_join(fac_names, by = "BLDGNUM")

top12_type_summed <- top12_type_summed %>%
  left_join(fac_names, by = "BLDGNUM")

top12_usage_bytype <- top12_usage_bytype %>%
  left_join(fac_names, by = "BLDGNUM")

#Energy Usage Intensity
top12_EUI <- top12_facility_usage %>%
  group_by(BLDGNUM) %>%
  mutate(Total_Use = sum(TotalUsage), Total_Sqft = sum(GROSSSQFT), EUI = Total_Use / Total_Sqft, .groups = 'keep') %>%
  filter(Total_Sqft != 0) %>%
  data.frame()

#Energy Usage Intensity
top12_EUI_bytype <- top12_usage_bytype %>%
  group_by(BLDGNUM) %>%
  mutate(Total_Use = sum(TotalUsage), Total_Sqft = sum(GROSSSQFT), EUI = Total_Use / Total_Sqft, .groups = 'keep') %>%
  filter(Total_Sqft != 0) %>%
  data.frame()

top12_type_summed_EUI <- top12_type_summed %>%
  group_by(BLDGNUM, EnergyType) %>%
  mutate(Total_Use = sum(TotalUsage), Total_Sqft = sum(GROSSSQFT), EUI = Total_Use / Total_Sqft, .groups = 'keep') %>%
  filter(Total_Sqft != 0) %>%
  data.frame()

# create decade columns
top12_usage_bytype <- top12_usage_bytype %>%
  mutate(Decade = paste0(floor(FYR / 10) * 10, "s"))

# top 12 total usage by energy type
top12_total_type_usage <- top12_usage_bytype %>%
  group_by(EnergyType) %>%
  summarise(TotalTypeUsage = sum(TotalUsage, na.rm = TRUE)) %>%
  ungroup()

# add decade columns
top12_electricity <- top12_electricity %>%
  mutate(Decade = paste0(floor(FYR / 10) * 10, "s"))

top12_elec_demand_combined <- top12_elec_demand_combined %>%
  mutate(Decade = paste0(floor(FYR / 10) * 10, "s"))

top12_type_usage_decade <- top12_usage_bytype %>%
  group_by(Decade, NAME, EnergyType) %>%
  summarise(TotalUsage = sum(TotalUsage, na.rm = TRUE)) %>%
  ungroup()

top12_usage_by_decade <- top12_usage_bytype %>%
  group_by(Decade, NAME) %>%
  summarise(TotalUsage = sum(TotalUsage, na.rm = TRUE)) %>%
  ungroup()

# import longitude and latitude info for a map
map_info <- read_csv("MapDetails.csv")
one_col <- c("STATE")
map_info = map_info[,!(names(map_info) %in% one_col)]

# join this to another df
top12_EUI <- top12_EUI %>%
  left_join(map_info, by = "CITY")

Energy Visualization Tabs

Facility Locations and Size

This map highlights the city and state of each facility. The size of each circle corresponds to the facility’s gross square footage, with larger circles representing bigger facilities.

top12_EUI$LONGITUDE <- -abs(top12_EUI$LONGITUDE)
my_colors <- as.character(paletteer_d("palettetown::smoochum"))
pal <- colorFactor(
  palette = my_colors,
  domain = top12_EUI$NAME
)

m <- leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = top12_EUI$LONGITUDE,
    lat = top12_EUI$LATITUDE,
    radius = sqrt(top12_EUI$GROSSSQFT) / 50,
    color = pal(top12_EUI$NAME),
    fillOpacity = 0.8,
    label = lapply(
      seq_len(nrow(top12_EUI)),
      function(i) {
        HTML(paste0(
          "<b>", top12_EUI$NAME[i], "</b><br>",
          "Gross Sqft: ", format(top12_EUI$GROSSSQFT[i], big.mark = ",")
        ))
      }
    ),
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "13px",
      direction = "auto"
    )
  ) %>%
  addLegend(
    "bottomleft",
    pal = pal,
    values = top12_EUI$NAME,
    title = "Facility"
  )

# display the map
m

Total Energy Usage by Facility

The total usage of each energy source from 1973 to 2019 was calculated for each facility, and the facilities were then ordered from highest to lowest total energy consumption.

palette_colors <- paletteer::paletteer_d("palettetown::smoochum")
selected_color_x <- palette_colors[c(5)]
selected_color_y <- palette_colors[c(12)]
top12_facility_usage$NAME <- tools::toTitleCase(tolower(top12_facility_usage$NAME))

ggplot(top12_facility_usage, aes(x = reorder(NAME, TotalUsage), y = TotalUsage / 1e12)) +
  geom_segment(aes(x = NAME, xend = NAME, y = 0, yend = TotalUsage / 1e12), color = selected_color_x) +
  geom_point(color = selected_color_y, size = 4, alpha = 0.6) +
  theme_light() +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(title = "Total Energy Usage by Facility ", y = "Total Usage (T kBtu)", x = "")

Usage by Energy Type

This stacked bar chart displays the energy mix of each facility, ordered from highest to lowest total energy consumption. Hover over the colored segments to see exact values for each energy source.

energy_levels <- c("Chilled Water", "Electricity", "Fuel Oil", "Natural gas", "Steam", "Water") # all your levels in desired order
top12_type_summed <- top12_type_summed %>%
  mutate(EnergyType = factor(EnergyType, levels = (energy_levels)))

facility_order <- top12_type_summed %>%
  group_by(NAME) %>%
  summarise(Total = sum(TotalUsage)) %>%
  arrange(desc(Total)) %>%   # ascending for horizontal bars with reversed axis
  pull(NAME)

top12_type_summed <- top12_type_summed %>%
  mutate(NAME = factor(NAME, levels = rev(facility_order)))

palette_colors <- paletteer::paletteer_d("palettetown::smoochum")
selected_colors <- as.character(palette_colors[c(3, 5, 11, 7, 2, 9)])
selected_colors <- rev(selected_colors)
top12_type_summed$NAME <- tools::toTitleCase(tolower(top12_type_summed$NAME))

plot_ly(top12_type_summed,
        x = ~TotalUsage,
        y = ~NAME,
        color = ~EnergyType,
        colors = selected_colors,
        type = 'bar',
        orientation = 'h',
        hovertemplate = paste(
          "Energy Type: %{customdata}<br>",
          "Usage: %{x} kBtu<br>",
          "<extra></extra>"
        ),
        customdata = top12_type_summed$EnergyType
) %>%
  layout(
    barmode = 'stack',
    title = "Total Usage by Facility by Energy Type",
    xaxis = list(title = "Total Usage"),
    yaxis = list(title = "Facility")
  )

Proportion of Facility Usage

This first pie chart compares the total energy consumption of each facility, showing which buildings use the most energy.

top12_facility_usage$NAME <- tools::toTitleCase(tolower(top12_facility_usage$NAME))

plot_ly(top12_facility_usage, labels = ~NAME, values = ~TotalUsage,
        type = "pie", textposition = "outside",
        textinfo="label + percent", marker = list(colors = palette_colors)) %>%
  layout(title = "Top 12 - Facility Usage")

Proportion of Energy Type

This pie chart breaks down the overall energy consumption by type, illustrating the proportion of different energy sources used across all facilities.

plot_ly(top12_total_type_usage, labels = ~EnergyType, values = ~TotalTypeUsage,
        type = "pie", textposition = "outside",
        textinfo="label + percent", marker = list(colors = palette_colors)) %>%
  layout(title = "Top 12 - Energy Sources")

Electricity Usage

This is a stacked area chart showing how much electricity each facility used from 1973 - 2019.

top12_electricity$NAME <- tools::toTitleCase(tolower(top12_electricity$NAME))
ggplot(top12_electricity, aes(x = FYR, y = Total_Electricity, fill = NAME)) +
  geom_area() +
  labs(title = "Stacked Electricity Usage Across Facilities",
       x = "Year", y = "Total Electricity (kWh)") +
  theme_clean() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(values = palette_colors) +
  scale_x_continuous(breaks = seq(min(top12_electricity$FYR), max(top12_electricity$FYR), by = 5), minor_breaks = seq(min(top12_electricity$FYR), max(top12_electricity$FYR), by = 1), labels = scales::label_number(accuracy = 1, big.mark = "")) +
  scale_y_continuous(
    breaks = pretty_breaks(n = 12),
    labels = comma
  )

Energy Efficiency

This scatterplot illustrates the energy efficiency of each facility. Energy Usage Intensity (EUI) is calculated by dividing a building’s annual total energy consumption (in kBtu) by its gross square footage. A lower EUI indicates that the building is using energy more efficiently.

top12_EUI %>%
  arrange(desc(EUI)) %>%
  mutate(NAME = factor(NAME, levels = unique(NAME))) %>%
  ggplot(aes(x = TotalUsage,
             y = EUI,
             size = GROSSSQFT,
             fill = NAME)) +
  geom_point(alpha = 1.0, shape = 21, color = "grey30") +
  scale_x_continuous(
    labels = label_number(scale = 1e-12, suffix = " T"),
    name = "Total Usage (k Btu, trillions)"
  ) +
  scale_y_continuous(
    labels = label_number(scale = 1e-6, suffix = " M"),
    name = "EUI (millions)"
  ) +
  scale_size(
    range = c(5, 24), 
    name = "Gross Sqft", 
    labels = scales::label_comma()
  ) +
  scale_fill_paletteer_d("palettetown::smoochum") +
  guides(
    fill = guide_legend(
      override.aes = list(shape = 21, size = 5, color = "grey30"), 
      byrow = FALSE
    ),
    size = guide_legend(override.aes = list(alpha = 0.8))
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "right",
    legend.box = "vertical",
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5), 
    plot.subtitle = element_text(face = "bold", hjust = 0.5)
  ) +
  labs(
    title = "EUI vs Total Usage, Scaled by Facility Size",
    subtitle = "Top 12 facilities by Energy Use Intensity (EUI)",
    x = "Total Usage (trillions)",
    y = "EUI (millions)"
  )

Conclusion

My analysis and these visualizations provide a broad overview of the energy consumption patterns across a subset of government buildings. One notable finding is that fuel oil represents the largest energy source. It also appears that larger buildings tend to exhibit greater energy efficiency compared to smaller ones; a logical next step would be to compare the EUI of buildings within specific categories. While I would have liked to explore renewable energy usage and visualize trends over time, missing data limited this analysis. Nevertheless, the dataset and these visualizations offer a clear and informative summary of energy consumption for these buildings.