In this workbook, we’ll explore the communities, economies, and natural disaster environments where power plants are located using logistic regression and clustering. Power plants generate electricity and are crucial for homes, businesses, and industries. Disasters like hurricanes, earthquakes, floods, and wildfires can threaten power supplies (source). Understanding the distribution of power plants and the characteristics of their surrounding communities is essential.

Packages

library(tidycensus)
library(tidyverse)
library(janitor)
library(sf)
library(tigris)
library(rio)
library(mapview)
library(stats)
library(highcharter)
library(tidymodels)
library(tidytext)

Load Data

U.S. Census

# load variables for the 2022 ACS 5-year estimates
vars <- load_variables(2022, "acs5", cache = TRUE)
# fetching data
data <- get_acs(
  state = "TX",
  geography = "county",
  variables = c(
    # poverty
    pov_belowlvl = "B17020_002",
    pov_abovelnl = "B17020_010",
    # race
    hisplat = "B03002_012",
    white = "B03002_003",
    black = "B03002_004",
    aian = "B03002_005",
    asian = "B03002_006",
    nhpi = "B03002_007",
    other = "B03002_008",
    multirace = "B03002_009",
    # education
    tot_ed = "B15003_001",
    diploma = "B15003_017",
    ged = "B15003_018",
    some_college_less1 = "B15003_019",
    some_college = "B15003_020",
    associates = "B15003_021",
    bachelors = "B15003_022",
    master = "B15003_023",
    professional = "B15003_024",
    doctoral = "B15003_025",
    # children
    under5men = "B01001_003",
    under5women = "B01001_027",
    # income
    medinc = "B19013_001",
    # homes
    tothome = "B25003_001",
    ownhome = 'B25003_002',
    renthome = 'B25003_003'
  ),
  geometry = TRUE,
  year = 2022
)
## Getting data from the 2018-2022 5-year ACS
## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
# view data
head(data)
county_acs22 <- data %>%
  clean_names() %>%
  select(geoid, name, variable, estimate, geometry) %>%
  pivot_wider(names_from = 'variable', values_from = 'estimate') %>%
  # clean county names
  mutate(name = str_trim(str_remove(name, "County, Texas"))) %>% 
  # calculate new variables
  mutate(
    # poverty rate
    povpct = pov_belowlvl / (pov_belowlvl + pov_abovelnl),
    # percent poc
    pocpct = 1 - (
      white / (hisplat + white + black + aian + asian + nhpi + other + multirace)
    ),
    # percent without high school education
    nohspct = 1 - (
      (
        diploma + ged + some_college_less1 + some_college + associates + bachelors + master + professional + doctoral
      ) / tot_ed
    ),
    # number of children under 5
    under5children = under5men + under5women,
    # percent rent
    rentpct = renthome / tothome
    
  )

head(county_acs22)
## Simple feature collection with 6 features and 33 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -105.998 ymin: 27.20931 xmax: -94.9085 ymax: 36.0555
## Geodetic CRS:  NAD83
## # A tibble: 6 × 34
##   geoid name                        geometry under5men under5women  white  black
##   <chr> <chr>             <MULTIPOLYGON [°]>     <dbl>       <dbl>  <dbl>  <dbl>
## 1 48273 Kleberg  (((-97.3178 27.49456, -97.…      1123        1052 6.11e3    886
## 2 48391 Refugio  (((-97.54085 28.16496, -97…       170         207 2.69e3    433
## 3 48201 Harris   (((-94.97839 29.68365, -94…    169281      163544 1.31e6 872237
## 4 48443 Terrell  (((-102.5669 30.28327, -10…         5           6 4.57e2      0
## 5 48229 Hudspeth (((-105.998 32.00233, -105…        17         108 5.45e2     24
## 6 48205 Hartley  (((-103.0422 35.82522, -10…       125         167 3.22e3    396
## # ℹ 27 more variables: aian <dbl>, asian <dbl>, nhpi <dbl>, other <dbl>,
## #   multirace <dbl>, hisplat <dbl>, tot_ed <dbl>, diploma <dbl>, ged <dbl>,
## #   some_college_less1 <dbl>, some_college <dbl>, associates <dbl>,
## #   bachelors <dbl>, master <dbl>, professional <dbl>, doctoral <dbl>,
## #   pov_belowlvl <dbl>, pov_abovelnl <dbl>, medinc <dbl>, tothome <dbl>,
## #   ownhome <dbl>, renthome <dbl>, povpct <dbl>, pocpct <dbl>, nohspct <dbl>,
## #   under5children <dbl>, rentpct <dbl>

Power plants

powerplants = import("global_power_plant_database_v_1_3/global_power_plant_database.csv")

glimpse(powerplants)
## Rows: 34,936
## Columns: 36
## $ country                        <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG…
## $ country_long                   <chr> "Afghanistan", "Afghanistan", "Afghanis…
## $ name                           <chr> "Kajaki Hydroelectric Power Plant Afgha…
## $ gppd_idnr                      <chr> "GEODB0040538", "WKS0070144", "WKS00711…
## $ capacity_mw                    <dbl> 33.00, 10.00, 10.00, 66.00, 100.00, 11.…
## $ latitude                       <dbl> 32.3220, 31.6700, 31.6230, 34.5560, 34.…
## $ longitude                      <dbl> 65.1190, 65.7950, 65.7920, 69.4787, 69.…
## $ primary_fuel                   <chr> "Hydro", "Solar", "Solar", "Hydro", "Hy…
## $ other_fuel1                    <chr> "", "", "", "", "", "", "", "", "", "",…
## $ other_fuel2                    <chr> "", "", "", "", "", "", "", "", "", "",…
## $ other_fuel3                    <chr> "", "", "", "", "", "", "", "", "", "",…
## $ commissioning_year             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 196…
## $ owner                          <chr> "", "", "", "", "", "", "", "", "", "",…
## $ source                         <chr> "GEODB", "Wiki-Solar", "Wiki-Solar", "G…
## $ url                            <chr> "http://globalenergyobservatory.org", "…
## $ geolocation_source             <chr> "GEODB", "Wiki-Solar", "Wiki-Solar", "G…
## $ wepp_id                        <chr> "1009793", "", "", "1009795", "1009797"…
## $ year_of_capacity_data          <int> 2017, NA, NA, 2017, 2017, 2017, 2017, 2…
## $ generation_gwh_2013            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ generation_gwh_2014            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ generation_gwh_2015            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ generation_gwh_2016            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ generation_gwh_2017            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ generation_gwh_2018            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ generation_gwh_2019            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ generation_data_source         <chr> "", "", "", "", "", "", "", "", "", "",…
## $ estimated_generation_gwh_2013  <dbl> 123.77, 18.43, 18.64, 225.06, 406.16, 5…
## $ estimated_generation_gwh_2014  <dbl> 162.90, 17.48, 17.58, 203.55, 357.22, 5…
## $ estimated_generation_gwh_2015  <dbl> 97.39, 18.25, 19.10, 146.90, 270.99, 42…
## $ estimated_generation_gwh_2016  <dbl> 137.76, 17.70, 17.62, 230.18, 395.38, 5…
## $ estimated_generation_gwh_2017  <dbl> 119.50, 18.29, 18.72, 174.91, 350.80, 4…
## $ estimated_generation_note_2013 <chr> "HYDRO-V1", "SOLAR-V1-NO-AGE", "SOLAR-V…
## $ estimated_generation_note_2014 <chr> "HYDRO-V1", "SOLAR-V1-NO-AGE", "SOLAR-V…
## $ estimated_generation_note_2015 <chr> "HYDRO-V1", "SOLAR-V1-NO-AGE", "SOLAR-V…
## $ estimated_generation_note_2016 <chr> "HYDRO-V1", "SOLAR-V1-NO-AGE", "SOLAR-V…
## $ estimated_generation_note_2017 <chr> "HYDRO-V1", "SOLAR-V1-NO-AGE", "SOLAR-V…
uspowerplants <- powerplants %>% 
  filter(country == "USA") %>% 
  select(name, latitude, longitude, primary_fuel, owner) 
# download the Texas counties shapefile
texas_counties <- counties(state = "TX", cb = TRUE)
## Retrieving data for the year 2022
# transform to an sf object
texas_sf <- st_as_sf(texas_counties)

st_crs(texas_sf)
## Coordinate Reference System:
##   User input: NAD83 
##   wkt:
## GEOGCRS["NAD83",
##     DATUM["North American Datum 1983",
##         ELLIPSOID["GRS 1980",6378137,298.257222101,
##             LENGTHUNIT["metre",1]]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["latitude",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["longitude",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     ID["EPSG",4269]]
# transform powerplants to sf
uspowerplants_sf <- uspowerplants %>%
  st_as_sf(coords = c("longitude", "latitude")) %>%
  st_set_crs(4269)

# check
head(uspowerplants_sf)
## Simple feature collection with 6 features and 3 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -118.2414 ymin: 33.7943 xmax: -71.4227 ymax: 42.0761
## Geodetic CRS:  NAD83
##                              name primary_fuel                      owner
## 1      100 Brook Hill Drive Solar        Solar         Diamond Properties
## 2       1025 Traveller Solar  LLC        Solar CI-II Mitchell Holding LLC
## 3 1047 Little Mountain Solar  LLC        Solar CI-II Mitchell Holding LLC
## 4          12 Applegate Solar LLC        Solar   G&S Solar Installers LLC
## 5             126 Grove Solar LLC        Solar        126 Grove Solar LLC
## 6                 1420 Coil Av #C        Solar            Konoike Pacific
##                    geometry
## 1   POINT (-73.9828 41.093)
## 2  POINT (-79.1263 35.4273)
## 3  POINT (-80.8067 36.1971)
## 4  POINT (-74.5761 40.2003)
## 5  POINT (-71.4227 42.0761)
## 6 POINT (-118.2414 33.7943)

Disaster declarations

txdisaster <- import("txdisaster14_24.csv")

head(txdisaster)
##   designated_area total_declarations
## 1          Harris                 13
## 2          Walker                 13
## 3          Jasper                 12
## 4         Liberty                 12
## 5         Bastrop                 11
## 6          Grimes                 11

GDP

txgdp <- import("txgdp_county.csv")

head(txgdp)
##     county    gdp2022
## 1    Texas 1924007472
## 2 Anderson    2080081
## 3  Andrews    2703487
## 4 Angelina    3322589
## 5  Aransas     731431
## 6   Archer     283399

Spatial join: I want to connect uspowerplants with the county, remove NA for those that are not in Texas.

txpwrplants_sf <- uspowerplants_sf %>%
  st_join(texas_sf) %>%
  clean_names() %>%
  filter(!is.na(name_2))

Powerplants by county

county_pwrplants <- txpwrplants_sf %>%
  group_by(name_2) %>%
  summarise(tot_pwrplants = n()) %>%
  arrange(desc(tot_pwrplants))

county_pwrplants
## Simple feature collection with 156 features and 2 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -106.5014 ymin: 25.9131 xmax: -93.5659 ymax: 36.4985
## Geodetic CRS:  NAD83
## # A tibble: 156 × 3
##    name_2    tot_pwrplants                                              geometry
##    <chr>             <int>                                      <MULTIPOINT [°]>
##  1 Harris               26 ((-94.9197 29.8247), (-94.9973 29.7535), (-95.0096 2…
##  2 Bexar                20 ((-98.6692 29.2219), (-98.5761 29.3525), (-98.66 29.…
##  3 Pecos                14 ((-102.5058 30.7726), (-102.4141 30.9514), (-102.268…
##  4 Travis               12 ((-97.6129 30.2098), (-97.7088 30.2199), (-97.7844 3…
##  5 Carson               10 ((-101.1866 35.2542), (-101.3108 35.2383), (-101.434…
##  6 Nolan                10 ((-100.5797 32.4969), (-100.3389 32.3606), (-100.370…
##  7 Nueces               10 ((-97.8228 27.5697), (-97.4192 27.8194), (-97.4283 2…
##  8 Brazoria              8 ((-95.4075 28.9913), (-95.3954 28.9888), (-95.394 29…
##  9 El Paso               8 ((-106.2119 31.8239), (-106.2189 31.8131), (-106.372…
## 10 Jefferson             8 ((-93.8913 29.9649), (-93.8836 29.9543), (-93.9523 2…
## # ℹ 146 more rows

join census county data with county of power plants

county_pwrplants %>%
  as_tibble() %>%
  rename(name = name_2)
## # A tibble: 156 × 3
##    name      tot_pwrplants                                              geometry
##    <chr>             <int>                                      <MULTIPOINT [°]>
##  1 Harris               26 ((-94.9197 29.8247), (-94.9973 29.7535), (-95.0096 2…
##  2 Bexar                20 ((-98.6692 29.2219), (-98.5761 29.3525), (-98.66 29.…
##  3 Pecos                14 ((-102.5058 30.7726), (-102.4141 30.9514), (-102.268…
##  4 Travis               12 ((-97.6129 30.2098), (-97.7088 30.2199), (-97.7844 3…
##  5 Carson               10 ((-101.1866 35.2542), (-101.3108 35.2383), (-101.434…
##  6 Nolan                10 ((-100.5797 32.4969), (-100.3389 32.3606), (-100.370…
##  7 Nueces               10 ((-97.8228 27.5697), (-97.4192 27.8194), (-97.4283 2…
##  8 Brazoria              8 ((-95.4075 28.9913), (-95.3954 28.9888), (-95.394 29…
##  9 El Paso               8 ((-106.2119 31.8239), (-106.2189 31.8131), (-106.372…
## 10 Jefferson             8 ((-93.8913 29.9649), (-93.8836 29.9543), (-93.9523 2…
## # ℹ 146 more rows
# combine
texas_combined <- texas_sf %>%
  as_tibble() %>%
  clean_names() %>%
  # spatial data join
  left_join(county_pwrplants %>%
              as_tibble() %>%
              rename(name = name_2),
            by = c('name')) %>%
  
  arrange(desc(tot_pwrplants)) %>%
  mutate(pwrplant_in_county = ifelse(is.na(tot_pwrplants), 0, 1)) %>%
  # acs data join
  left_join(county_acs22, by = 'name') %>%
  mutate(
    maj_poc = ifelse(pocpct > 0.5, 'poc', 'white'),
    maj_rent = ifelse(rentpct > 0.5, 'rent', 'own')
  )


texas_combined
## # A tibble: 254 × 51
##    statefp countyfp countyns affgeoid   geoid.x name  namelsad stusps state_name
##    <chr>   <chr>    <chr>    <chr>      <chr>   <chr> <chr>    <chr>  <chr>     
##  1 48      201      01383886 0500000US… 48201   Harr… Harris … TX     Texas     
##  2 48      029      01383800 0500000US… 48029   Bexar Bexar C… TX     Texas     
##  3 48      371      01383971 0500000US… 48371   Pecos Pecos C… TX     Texas     
##  4 48      453      01384012 0500000US… 48453   Trav… Travis … TX     Texas     
##  5 48      355      01383963 0500000US… 48355   Nuec… Nueces … TX     Texas     
##  6 48      353      01383962 0500000US… 48353   Nolan Nolan C… TX     Texas     
##  7 48      065      01383818 0500000US… 48065   Cars… Carson … TX     Texas     
##  8 48      141      01383855 0500000US… 48141   El P… El Paso… TX     Texas     
##  9 48      245      01383908 0500000US… 48245   Jeff… Jeffers… TX     Texas     
## 10 48      039      01383805 0500000US… 48039   Braz… Brazori… TX     Texas     
## # ℹ 244 more rows
## # ℹ 42 more variables: lsad <chr>, aland <dbl>, awater <dbl>,
## #   geometry.x <MULTIPOLYGON [°]>, tot_pwrplants <int>,
## #   geometry.y <MULTIPOINT [°]>, pwrplant_in_county <dbl>, geoid.y <chr>,
## #   geometry <MULTIPOLYGON [°]>, under5men <dbl>, under5women <dbl>,
## #   white <dbl>, black <dbl>, aian <dbl>, asian <dbl>, nhpi <dbl>, other <dbl>,
## #   multirace <dbl>, hisplat <dbl>, tot_ed <dbl>, diploma <dbl>, ged <dbl>, …

Merge GDP and disaster declarations

texas_combined <- texas_combined %>% 
  # gdp
  left_join(txgdp %>% rename(name = county), by = 'name') %>% 
  # disaster declarations
  left_join(txdisaster %>% rename(name = designated_area), by = 'name')

texas_combined
## # A tibble: 254 × 53
##    statefp countyfp countyns affgeoid   geoid.x name  namelsad stusps state_name
##    <chr>   <chr>    <chr>    <chr>      <chr>   <chr> <chr>    <chr>  <chr>     
##  1 48      201      01383886 0500000US… 48201   Harr… Harris … TX     Texas     
##  2 48      029      01383800 0500000US… 48029   Bexar Bexar C… TX     Texas     
##  3 48      371      01383971 0500000US… 48371   Pecos Pecos C… TX     Texas     
##  4 48      453      01384012 0500000US… 48453   Trav… Travis … TX     Texas     
##  5 48      355      01383963 0500000US… 48355   Nuec… Nueces … TX     Texas     
##  6 48      353      01383962 0500000US… 48353   Nolan Nolan C… TX     Texas     
##  7 48      065      01383818 0500000US… 48065   Cars… Carson … TX     Texas     
##  8 48      141      01383855 0500000US… 48141   El P… El Paso… TX     Texas     
##  9 48      245      01383908 0500000US… 48245   Jeff… Jeffers… TX     Texas     
## 10 48      039      01383805 0500000US… 48039   Braz… Brazori… TX     Texas     
## # ℹ 244 more rows
## # ℹ 44 more variables: lsad <chr>, aland <dbl>, awater <dbl>,
## #   geometry.x <MULTIPOLYGON [°]>, tot_pwrplants <int>,
## #   geometry.y <MULTIPOINT [°]>, pwrplant_in_county <dbl>, geoid.y <chr>,
## #   geometry <MULTIPOLYGON [°]>, under5men <dbl>, under5women <dbl>,
## #   white <dbl>, black <dbl>, aian <dbl>, asian <dbl>, nhpi <dbl>, other <dbl>,
## #   multirace <dbl>, hisplat <dbl>, tot_ed <dbl>, diploma <dbl>, ged <dbl>, …

Descriptive Analysis

Now, let’s explore the prevalence of power plants in Texas counties. Power plants are quite common in the state, with two out of three counties with at least one power plant.

# analysis
texas_combined %>%
  group_by(pwrplant_in_county) %>%
  summarise(tot = n()) %>%
  mutate(per = tot / sum(tot))
## # A tibble: 2 × 3
##   pwrplant_in_county   tot   per
##                <dbl> <int> <dbl>
## 1                  0    98 0.386
## 2                  1   156 0.614
# define colors
colors <- c("No" = "#e1d9cf", "Yes" = "#094044")

mapview(
  # modify presence of power plant label
  texas_combined %>%
    st_as_sf() %>%
    mutate(pwrplant_in_county = ifelse(pwrplant_in_county == 1, 'Yes', 'No')),
  zcol = "pwrplant_in_county",
  layer.name = "Presence of power plant?",
  col.regions = colors
)

How are power plants distributed across Texas counties? Harris County leads with 26 power plants, followed by Travis County with 20 and Carson County with 14. Interestingly, the majority of counties—72%—have fewer than 3 power plants. Specifically, single power plants are the most common scenario, present in 40% of counties.

# summary stats of total power plants per county
summary(texas_combined$tot_pwrplants)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00    1.00    2.00    3.09    4.00   26.00      98
# percent of counties by total power plants
texas_combined %>% 
    filter(pwrplant_in_county == 1) %>% 
  mutate(tot_pwrplants = as.factor(tot_pwrplants)) %>% 
  group_by(tot_pwrplants) %>% 
  summarise(tot = n()) %>% 
  mutate(per = tot/sum(tot)) %>% 
  arrange(desc(per))
## # A tibble: 13 × 3
##    tot_pwrplants   tot     per
##    <fct>         <int>   <dbl>
##  1 1                62 0.397  
##  2 2                31 0.199  
##  3 3                19 0.122  
##  4 4                13 0.0833 
##  5 6                11 0.0705 
##  6 5                 7 0.0449 
##  7 8                 4 0.0256 
##  8 10                3 0.0192 
##  9 7                 2 0.0128 
## 10 12                1 0.00641
## 11 14                1 0.00641
## 12 20                1 0.00641
## 13 26                1 0.00641
# define colors
color <- c("#102937")

# visual
bar_chart <- texas_combined %>%
  filter(pwrplant_in_county == 1) %>% 
  head(10) %>% 
  hchart("bar", hcaes(x = name, y = tot_pwrplants), color = "#102937") %>%
  hc_add_theme(hc_theme_flat()) %>%  
  hc_title(text = "Harris County leads with 26 power plants") %>%
  hc_yAxis(title = list(text = "Total number of power plants in top ten counties")) %>% 
  hc_xAxis(title = list(text = NULL))

bar_chart

According to the Global Power Plant Database, the majority of power plants in Texas are fueled by renewable energy. 58% of power plants in Texas are powered by wind, solar, biomass, hydro, or waste.

# analysis
powerplant_type <- txpwrplants_sf %>%
  as.tibble() %>%
  mutate(
    primary_fuel = case_when(
      primary_fuel %in% c('Wind', 'Solar', 'Biomass', 'Wind', 'Hydro', 'Waste') ~ 'Renewable',
      primary_fuel %in% c('Gas', 'Coal', 'Oil', 'Petcoke', 'Nuclear') ~ 'Non-Renewable',
      TRUE ~ 'Other'
    )
  ) %>%
  group_by(primary_fuel) %>%
  summarise(total_plants = n()) %>%
  arrange(desc(total_plants)) %>%
  mutate(pct = total_plants / sum(total_plants))
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
powerplant_type
## # A tibble: 3 × 3
##   primary_fuel  total_plants    pct
##   <chr>                <int>  <dbl>
## 1 Renewable              281 0.583 
## 2 Non-Renewable          191 0.396 
## 3 Other                   10 0.0207
# define colors
colors <- c("#102937", "#f9744b", "#d6c4b0")

# Create the pie chart with the custom colors
powerplant_type %>%
  mutate(pct = round(pct * 100, 1)) %>%
  hchart("pie", hcaes(x = primary_fuel, y = pct)) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_colors(colors) %>%
  hc_title(text = "Renewable energy powers electricity in Texas") %>%
  hc_subtitle(text = "Share of powerplants by fuel type") %>%
  hc_add_theme(hc_theme_ggplot2())
colors <- c("#e1d9cf", "#102937")

texas_combined %>%
  mutate(pwrplant_in_county = ifelse(pwrplant_in_county == 1, 'Yes', 'No')) %>%
  hchart("scatter",
         hcaes(x = medinc, y = rentpct, group = pwrplant_in_county)) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_colors(colors) %>%
  hc_title(text = "Income vs. Rent Percentage by Presence of Power Plant") %>%
  hc_xAxis(title = list(text = "Median Income")) %>%
  hc_yAxis(title = list(text = "Rent Percentage")) %>%
  hc_plotOptions(scatter = list(marker = list(
    radius = 3,
    symbol = "circle",
    fillOpacity = 0.6
  ))) %>%
  hc_legend(
    title = list(text = "Presence of power plant?"),
    align = "right",
    verticalAlign = "top",
    layout = "vertical"
  )

Counties that are majority POC have more power plants than white counties.

texas_combined %>% 
  filter(pwrplant_in_county == 1) %>% 
  select(tot_pwrplants, maj_poc) %>% 
  group_by(maj_poc) %>% 
  summarise(total_powerplants = sum(tot_pwrplants))
## # A tibble: 2 × 2
##   maj_poc total_powerplants
##   <chr>               <int>
## 1 poc                   260
## 2 white                 222
texas_combined %>% 
  filter(pwrplant_in_county == 1,
         maj_poc == 'poc') %>% 
  select(name, tot_pwrplants, maj_poc) 
## # A tibble: 64 × 3
##    name      tot_pwrplants maj_poc
##    <chr>             <int> <chr>  
##  1 Harris               26 poc    
##  2 Bexar                20 poc    
##  3 Pecos                14 poc    
##  4 Travis               12 poc    
##  5 Nueces               10 poc    
##  6 El Paso               8 poc    
##  7 Jefferson             8 poc    
##  8 Brazoria              8 poc    
##  9 Howard                7 poc    
## 10 Moore                 6 poc    
## # ℹ 54 more rows

Counties that are majority White are more likely to have a power plant.

# analysis
texas_combined %>% 
  mutate(pwrplant_in_tract = as.factor(pwrplant_in_county)) %>% 
  select(maj_poc, pwrplant_in_county) %>% 
  group_by(pwrplant_in_county, maj_poc) %>% 
  summarise(tot = n()) %>% 
  group_by(pwrplant_in_county) %>% 
  mutate(pct = tot/sum(tot))
## `summarise()` has grouped output by 'pwrplant_in_county'. You can override
## using the `.groups` argument.
## # A tibble: 4 × 4
## # Groups:   pwrplant_in_county [2]
##   pwrplant_in_county maj_poc   tot   pct
##                <dbl> <chr>   <int> <dbl>
## 1                  0 poc        25 0.255
## 2                  0 white      73 0.745
## 3                  1 poc        64 0.410
## 4                  1 white      92 0.590
colors <- c("#e1d9cf", "#102937")


texas_combined %>% 
  mutate(pwrplant_in_county = ifelse(pwrplant_in_county == 1, 'Yes', 'No')) %>%
  select(maj_poc, pwrplant_in_county) %>% 
  group_by(pwrplant_in_county, maj_poc) %>% 
  summarise(tot = n()) %>% 
  group_by(pwrplant_in_county) %>% 
  mutate(pct = round(tot/sum(tot) * 100, 1)) %>%
  hchart("bar", hcaes(x = pwrplant_in_county, y = pct, group = maj_poc)) %>%
  hc_add_theme(hc_theme_flat()) %>%
  hc_colors(colors) %>% 
  hc_title(text = "Most counties with power plants are predominantly white",
           align = "center") %>%
  hc_subtitle(text = "Share of power plant",
              align = "center") %>%
  hc_yAxis(title = list(text = "Percent")) %>%
  hc_xAxis(title = list(text = 'Presence of power plant?')) %>%
  hc_plotOptions(
    bar = list(
      stacking = "percent"
    )
  ) %>%
  hc_legend(align = 'center', verticalAlign = 'top', layout = 'horizontal')
## `summarise()` has grouped output by 'pwrplant_in_county'. You can override
## using the `.groups` argument.

Logistic Regression

Using logistic regression, we examined the relationship between median income, the percentage of renting households, and the percentage of people of color in Texas counties. The results indicate that counties with higher median incomes and higher rental percentages tend to have higher odds of having power plants.

tx_analysis <- texas_combined %>%
  select(
    name,
    tot_pwrplants,
    pwrplant_in_county,
    medinc,
    povpct,
    pocpct,
    nohspct,
    under5children,
    tothome,
    rentpct,
    maj_poc,
    maj_rent,
    gdp2022,
    total_declarations
  )

tx_analysis
## # A tibble: 254 × 14
##    name      tot_pwrplants pwrplant_in_county medinc povpct pocpct nohspct
##    <chr>             <int>              <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
##  1 Harris               26                  1  70789 0.158   0.724  0.179 
##  2 Bexar                20                  1  67275 0.152   0.739  0.141 
##  3 Pecos                14                  1  59325 0.230   0.763  0.241 
##  4 Travis               12                  1  92731 0.113   0.523  0.0918
##  5 Nueces               10                  1  64027 0.173   0.720  0.155 
##  6 Nolan                10                  1  47437 0.155   0.476  0.161 
##  7 Carson               10                  1  83199 0.0750  0.166  0.0758
##  8 El Paso               8                  1  55417 0.195   0.888  0.200 
##  9 Jefferson             8                  1  57294 0.182   0.616  0.152 
## 10 Brazoria              8                  1  91972 0.0753  0.564  0.112 
## # ℹ 244 more rows
## # ℹ 7 more variables: under5children <dbl>, tothome <dbl>, rentpct <dbl>,
## #   maj_poc <chr>, maj_rent <chr>, gdp2022 <int>, total_declarations <int>
glm.fits <- glm(
  pwrplant_in_county ~  medinc + rentpct + pocpct, 
  data = tx_analysis,
  family = binomial
)

summary(glm.fits)
## 
## Call:
## glm(formula = pwrplant_in_county ~ medinc + rentpct + pocpct, 
##     family = binomial, data = tx_analysis)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.748e+00  8.614e-01  -3.191  0.00142 **
## medinc       2.443e-05  9.859e-06   2.478  0.01321 * 
## rentpct      4.879e+00  1.923e+00   2.537  0.01118 * 
## pocpct       8.940e-01  7.304e-01   1.224  0.22090   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 336.85  on 252  degrees of freedom
## Residual deviance: 319.16  on 249  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 327.16
## 
## Number of Fisher Scoring iterations: 4

How could we quickly find which set of variables would be best to predict the presence of a power plat? We look at the Akaike Information Criterion (AIC). After exploring various models using AIC, I found that the best-fitting model included all variables except GDP and the number of disaster declarations. A low AIC score indicates a well-fitting model with fewer variables (source). While AIC allowed us to compare model fits, further training and testing are necessary to enhance prediction accuracy.

# predictor variables
predictors <- c(
  "medinc",
  "povpct",
  "pocpct",
  "nohspct",
  "rentpct",
  "under5children",
  'tothome',
  'gdp2022',
  'total_declarations'
)

# empty formula variable
base_formula <- "pwrplant_in_county ~ 1"

# empty data frame
results <- data.frame(variables = character(),
                      AIC = numeric(),
                      stringsAsFactors = FALSE)

# loop to add variables
for (i in seq_along(predictors)) {
  # update formula
  current_formula <- paste(base_formula, paste(predictors[1:i], collapse = " + "), sep = " + ")
  
  # fit model
  glm.fits <- glm(as.formula(current_formula),
                  data = tx_analysis,
                  family = binomial)
  
  # pull AIC
  aic_score <- AIC(glm.fits)
  
  # combine results
  results <- rbind(results,
                   data.frame(
                     variables = paste(predictors[1:i], collapse = ", "),
                     AIC = aic_score,
                     stringsAsFactors = FALSE
                   ))
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# results
results %>%
  as.tibble %>%
  arrange(AIC)
## # A tibble: 9 × 2
##   variables                                                                  AIC
##   <chr>                                                                    <dbl>
## 1 medinc, povpct, pocpct, nohspct, rentpct, under5children                  317.
## 2 medinc, povpct, pocpct, nohspct, rentpct, under5children, tothome         319.
## 3 medinc, povpct, pocpct, nohspct, rentpct, under5children, tothome, gdp2…  321.
## 4 medinc, povpct, pocpct, nohspct, rentpct, under5children, tothome, gdp2…  321.
## 5 medinc, povpct, pocpct, nohspct, rentpct                                  326.
## 6 medinc, povpct, pocpct                                                    331.
## 7 medinc, povpct, pocpct, nohspct                                           333.
## 8 medinc                                                                    337.
## 9 medinc, povpct                                                            339.

Clustering Using PCA and K-Means

Clustering is a data analysis technique that groups similar observations based on hidden patterns. It helps us see how counties relate to each other.

txcluster_df <- tx_analysis %>%
  filter(!is.na(medinc)) %>%
  mutate(pwrplant_in_county = ifelse(pwrplant_in_county == 1, "Yes", "No")) %>%
  mutate(tot_pwrplants = ifelse(is.na(tot_pwrplants), 0, tot_pwrplants)) %>%
  select(-c(maj_poc, maj_rent))

head(txcluster_df)
## # A tibble: 6 × 12
##   name   tot_pwrplants pwrplant_in_county medinc povpct pocpct nohspct
##   <chr>          <dbl> <chr>               <dbl>  <dbl>  <dbl>   <dbl>
## 1 Harris            26 Yes                 70789  0.158  0.724  0.179 
## 2 Bexar             20 Yes                 67275  0.152  0.739  0.141 
## 3 Pecos             14 Yes                 59325  0.230  0.763  0.241 
## 4 Travis            12 Yes                 92731  0.113  0.523  0.0918
## 5 Nueces            10 Yes                 64027  0.173  0.720  0.155 
## 6 Nolan             10 Yes                 47437  0.155  0.476  0.161 
## # ℹ 5 more variables: under5children <dbl>, tothome <dbl>, rentpct <dbl>,
## #   gdp2022 <int>, total_declarations <int>
pca_rec <- recipe(~., data = txcluster_df) %>%
  update_role(name, pwrplant_in_county, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_prep <- prep(pca_rec)

pca_prep
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## predictor: 10
## id:         2
## 
## ── Training information
## Training data contained 253 data points and no incomplete rows.
## 
## ── Operations
## • Centering and scaling for: tot_pwrplants, medinc, povpct, ... | Trained
## • PCA extraction with: tot_pwrplants, medinc, povpct, pocpct, ... | Trained
summary(pca_prep$steps[[2]]$res)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6    PC7
## Standard deviation     1.9405 1.6602 1.0423 0.84010 0.78033 0.71470 0.5865
## Proportion of Variance 0.3766 0.2756 0.1086 0.07058 0.06089 0.05108 0.0344
## Cumulative Proportion  0.3766 0.6522 0.7609 0.83142 0.89232 0.94340 0.9778
##                            PC8     PC9    PC10
## Standard deviation     0.42267 0.19958 0.05966
## Proportion of Variance 0.01787 0.00398 0.00036
## Cumulative Proportion  0.99566 0.99964 1.00000

PC1 is about the total number of homes + number of children under 5 + GDP. Since this explains 40% of the variance, these are some of the most important variables. PC2 is mostly about percent of people without a high school education + poverty rate + percent poc.

tidied_pca <- tidy(pca_prep, 2)

# plotting pc sets
tidied_pca %>%
  filter(str_extract(component, "(\\d)+") %>% as.numeric() <= 5) %>%
  group_by(component) %>%
  top_n(n = 8, wt = abs(value)) %>%
  mutate(component = fct_inorder(component)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms)) +
  geom_col(aes(fill = terms)) +
  geom_vline(xintercept = 0) +
  scale_y_reordered() +
  guides(fill = guide_legend(title = "Category")) +
  geom_text(
    aes(label = round(abs(value), 2)),
    position = position_stack(vjust = 0.5),
    size = 3,
    color = "white"
  ) +
  theme_classic() +
  theme(axis.text = element_text(size = 9), 
        legend.position = 'none') +
  facet_wrap( ~ component, 
              ncol = 3, 
              scales = 'free') +
  labs(
    title = 'Comparing PCs', 
      x = "", 
       y = "") 

pca_bake <- bake(pca_prep, new_data = NULL)

pca_bake %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(color = pwrplant_in_county)) +
  labs(title = 'Test PCA Chart') + 
  theme_classic()

We will now cluster the counties in the principal component space using k-means clustering.

set.seed(2024)

pca_kmeans <- pca_bake %>%
  select(where(is.numeric))

pca_kmeans
## # A tibble: 253 × 5
##        PC1    PC2    PC3     PC4     PC5
##      <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
##  1 -21.2   -0.147 -2.14  -3.12   -0.890 
##  2  -9.00   0.520  0.643 -1.18    0.436 
##  3  -1.20   2.12   1.39  -0.599  -0.134 
##  4  -7.35  -1.37  -0.252  1.37    0.601 
##  5  -2.38   0.927  0.201  1.10    0.508 
##  6  -0.733  0.738  0.203 -0.0549  1.03  
##  7  -0.340 -2.64   1.13  -0.789   0.0644
##  8  -3.17   2.07   1.16  -0.883   0.176 
##  9  -1.70   0.847 -0.304  0.799   0.636 
## 10  -2.12  -1.85   0.268  1.63   -1.01  
## # ℹ 243 more rows
# perform k-means clustering for different k values
kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~ kmeans(pca_kmeans, centers = .x)),
    glanced = map(kclust, glance),
    tidyied = map(kclust, tidy),
    augmented = map2(kclust, k, ~ augment(.x, data = pca_kmeans, k = .y))
  )

The ideal number of clusters, or the “elbow point,” is four. We will then separate the counties into four groups.

clusterings <- kclusts %>%
  unnest(cols = c(glanced))

clusterings %>%
  ggplot(aes(k, tot.withinss)) +
  geom_line() +
  geom_point() +
  labs(title = "Elbow graph") + 
  theme_classic()

centres <- kclusts %>%
  filter(k == 4) %>%
  unnest(tidyied)

assignments <- kclusts %>%
  filter(k == 4) %>%
  unnest(cols = c(augmented)) %>%
  pull(.cluster)

pca_bake$cluster <- assignments
colors <- c("#102937", "#f9744b", "#d6c4b0", "#b74c7e")

pca_bake %>%
  ggplot(aes(PC1, PC2, color = as.factor(cluster), shape = as.factor(pwrplant_in_county))) +
  geom_text(aes(label = name), color = 'black', check_overlap = TRUE) + 
  geom_point(size = 3) + 
  geom_rug() + 
  scale_shape_manual(values = c(8, 16)) + 
  scale_color_manual(values = colors) + 
  labs(title = 'Clustered PCA Map of TX Counties', 
       shape = 'Presense of power plant?', 
       color = 'Cluster') + 
  theme_classic() +
  theme(legend.position = 'top')

txcluster_df %>%
  left_join(pca_bake %>%
              select(name, cluster), by = 'name') %>%
  relocate(cluster, .after = 'name') %>%
  group_by(cluster) %>%
  summarise(across(where(is.numeric), list(
    mean = mean, std = sd, med = median
  ))) %>%
  select(
    cluster,
    tot_pwrplants_mean,
    medinc_med,
    povpct_mean,
    pocpct_mean,
    nohspct_mean,
    under5children_mean,
    rentpct_mean,
    gdp2022_mean, 
    total_declarations_mean,
    tothome_mean
  )
## # A tibble: 4 × 11
##   cluster tot_pwrplants_mean medinc_med povpct_mean pocpct_mean nohspct_mean
##   <fct>                <dbl>      <dbl>       <dbl>       <dbl>        <dbl>
## 1 1                     2.16     51942       0.218        0.744        0.278
## 2 2                     1.31     56800.      0.163        0.389        0.155
## 3 3                    13.4      70789       0.134        0.655        0.146
## 4 4                     1.84     71625       0.0945       0.355        0.122
## # ℹ 5 more variables: under5children_mean <dbl>, rentpct_mean <dbl>,
## #   gdp2022_mean <dbl>, total_declarations_mean <dbl>, tothome_mean <dbl>

I found the following characteristics for each cluster:

# define colors
colors <- c("#102937", "#f9744b", "#d6c4b0", "#b74c7e")

mapview(
  texas_combined %>%
    left_join(
      pca_bake %>%
        mutate(
          cluster_group = case_when(
            cluster == 1 ~ "<b>Western Border: </b> second highest power plant count, fewest disaster declarations, lower socioeconomic status",
            cluster == 2 ~ "<b> Eastern rural: </b> lowest power plant count, second highest disaster declarations, low socioeconomic status.",
            cluster == 3 ~ "<b> Triangle: </b> highest power plant count, highest disaster declarations, highest GDP and moderate socioeconomic status.",
            cluster == 4 ~ "<b>Suburbs: </b> moderate power plant count, moderate disaster declarations, highest socioeconomic status.",
            TRUE ~ NA_character_
          )
        ) %>%
        select(name, cluster_group, cluster),
      by = 'name'
    ) %>%
    st_as_sf(),
  zcol = "cluster_group",
  layer.name = "Cluster",
  col.regions = colors
)
## Warning: Found more colors (5) than zcol values (4)! 
## Trimming colors to match number of zcol values.

Resources