Electricity cost and VRE penetration

Set up

Code
library(readxl)
library(tidyverse)
library(dplyr)
library(tidyr)
library(ggplot2)
library(fuzzyjoin)
library(lubridate)
library(rpart)
library(rpart.plot)
library(sandwich)
library(lmtest)
library(clubSandwich)
library(car)
library(plm)
library(lmtest)
library(sandwich)
library(plotly)
library(ggrepel)
library(scales)
library(ggforce)

# Load Excel workbook
file_path <- "10-Data_Workbook.xlsx"

# List all sheet names in the workbook
sheets <- excel_sheets(file_path)
print(sheets)
 [1] "Summary"                         "1.1REP"                         
 [3] "1.2GPP"                          "1.3energycpiq"                  
 [5] "1.4OECD"                         "2.1IRENA"                       
 [7] "2.2EI"                           "2.3Em"                          
 [9] "2.4Em"                           "3.1Pop"                         
[11] "3.2PPP"                          "3.3GDP_pp"                      
[13] "Country_Map"                     "Countries"                      
[15] "per-capita-electricity-fossil-n" "installed-solar-pv-capacity"    
[17] "electricity-prod-source-stacked" "share-elec-by-source"           
[19] "per-capita-electricity-source-s"

Load and clean

Load relevant data

Code
# Retail electricity prices
df_elec_cost <- read_excel(file_path, sheet = "1.1REP")

# Electricity prices (from globalpetrolprices.com)
df_gpp_elec_cost <- read_excel(file_path, sheet = "1.2GPP")

# IRENA generation (MW / GWh)
df_IRENA_gen <- read_excel(file_path, sheet = "2.1IRENA") 

# Energy Institute data
df_EI <- read_excel(file_path, sheet = "2.2EI") 

# Ember annual data
df_Ember_annual <- read_excel(file_path, sheet = "2.3Em") 

# Ember monthly data
df_Ember_monthly <- read_excel(file_path, sheet = "2.4Em")

# Country population
df_pop <- read_excel(file_path, sheet = "3.1Pop", col_types = "text")

# Purchasing Power Parity - convert to long
df_ppp <- read_excel(file_path, sheet = "3.2PPP")

df_ppp <- df_ppp %>% 
  pivot_longer(
    cols = `1960`:`2024`,        
    names_to   = "Year",
    values_to  = "ppp",
    values_drop_na = TRUE,      
    names_transform = list(Year = as.integer)  
  )

# Quartlery consumer electricity price index
df_cpi_energy <- read_excel(file_path, sheet = "1.3energycpiq")

# OECD electricity prices
df_oecd_elec <- read_excel(file_path, sheet = "1.4OECD")

# GDP per capita (World Bank)
df_gdp_capita <- read_excel(file_path, sheet = "3.3GDP_pp")


# Create a named list of data frames
dfs <- list(
  elec_cost      = df_elec_cost,
  IRENA_gen      = df_IRENA_gen,
  EI             = df_EI,
  Ember_annual   = df_Ember_annual,
  Ember_monthly  = df_Ember_monthly,
  pop            = df_pop,
  PPP            = df_ppp,
  gpp            = df_gpp_elec_cost,
  cpi_energy     = df_cpi_energy,
  oecd_elec_price = df_oecd_elec,
  gdp_capita = df_gdp_capita
)

Clean country names

Code
# Load country name mapping
country_mapping <- read_excel(file_path, sheet = "Country_Map")

# Create named vector for exact replacement: 'keys' are original and 'values' are revised names
country_map <- setNames(country_mapping$Revised_Name, country_mapping$Name)

# Helper function to replace country names based on exact match
replace_country_names <- function(vec, mapping) {
  # Only update elements that exactly match a key in the mapping
  matches <- vec %in% names(mapping)
  vec[matches] <- mapping[vec[matches]]
  return(vec)
}

# Define a vector that links each data frame (by name in the list) to its country column name
country_cols <- c(
  elec_cost      = "Country",
  IRENA_gen      = "Country/area",
  EI             = "Country",
  Ember_annual   = "Area",
  Ember_monthly  = "Area",
  pop            = "Region, subregion, country or area *",
  PPP            = "Country Name",
  gpp            = "Countries",
  cpi_energy     = "country",
  oecd_elec_price = "Reference area",
  gdp_capita = "Country Name"
)

# Rename countries
dfs <- imap(dfs, ~ {
  col <- country_cols[[.y]]
  .x %>% mutate(!!col := replace_country_names(.data[[col]], country_map))
})

# Standardise country/area column names
standardise_countries <- function(dfs, country_cols, mapping) {
  imap(dfs, ~ {
    df      <- .x
    old_col <- country_cols[[.y]]
    stopifnot(old_col %in% names(df))
    
    df %>%
      # rename whatever the old header was, into Country/Area
      rename(`Country/Area` = all_of(old_col)) %>%
      # trim and map
      mutate(
        `Country/Area` = trimws(`Country/Area`),
        `Country/Area` = replace_country_names(`Country/Area`, mapping)
      )
  })
}

dfs <- standardise_countries(dfs, country_cols, country_map)

Check for countries/areas appearing only in one df

Code
# Create a data frame that collects unique country names from each data frame along with its source
country_df <- imap_dfr(dfs, ~ {
  tibble(
    DataFrame = .y,
    `Country/Area`   = unique(.x$`Country/Area`)
  )
})

# Group by country and keep only those that appear exactly once (i.e. only in one data frame)
unique_countries <- country_df %>%
  group_by(`Country/Area`) %>%
  filter(n() == 1) %>%   # only countries that appear in one data frame
  ungroup() %>% 
  arrange(`Country/Area`)

# Display the two-column table (Country, DataFrame)
print(unique_countries)
# A tibble: 151 × 2
   DataFrame  `Country/Area`                
   <chr>      <chr>                         
 1 gdp_capita Africa Eastern and Southern   
 2 gdp_capita Africa Western and Central    
 3 pop        Americas                      
 4 gdp_capita Arab World                    
 5 pop        Australia/New Zealand         
 6 pop        Caribbean                     
 7 gdp_capita Caribbean small states        
 8 pop        Central America               
 9 pop        Central Asia                  
10 gdp_capita Central Europe and the Baltics
# ℹ 141 more rows

Keep relevant columns in population

Code
# First, create a trimmed version of the column to remove extra spaces
dfs[["pop"]] <- dfs[["pop"]] %>%
  mutate(Total_Population_thousands_Jul_trim = trimws(Total_Population_thousands_Jul))

# Identify and print rows with non-numeric values in the trimmed column
non_numeric <- dfs[["pop"]] %>%
  filter(!grepl("^[0-9.]+$", Total_Population_thousands_Jul_trim)) %>%
  select(Index, Total_Population_thousands_Jul)

if(nrow(non_numeric) > 0) {
  cat("Rows removed due to non-numeric Total_Population_thousands_Jul values:\n")
  print(non_numeric)
} else {
  cat("No non-numeric rows found in Total_Population_thousands_Jul.\n")
}
No non-numeric rows found in Total_Population_thousands_Jul.
Code
# Filter out any rows that don't have a valid numeric value, and then select & convert columns
dfs[["pop"]] <- dfs[["pop"]] %>%
  filter(grepl("^[0-9.]+$", Total_Population_thousands_Jul_trim)) %>%
  select(Index, Variant, `Country/Area`, Year, Total_Population_thousands_Jul) %>%
  mutate(
    # Convert the cleaned column to numeric and multiply by 1000
    Total_Population_thousands_Jul = as.numeric(trimws(Total_Population_thousands_Jul)) * 1000,
    Year = as.numeric(Year)
  ) %>%
  rename(Population = Total_Population_thousands_Jul)

# Check the updated structure
print(dfs[["pop"]])
# A tibble: 22,280 × 5
   Index Variant   `Country/Area`  Year Population
   <chr> <chr>     <chr>          <dbl>      <dbl>
 1 1     Estimates World           1950 2493092848
 2 2     Estimates World           1951 2536927035
 3 3     Estimates World           1952 2584086339
 4 4     Estimates World           1953 2634106235
 5 5     Estimates World           1954 2685894860
 6 6     Estimates World           1955 2740213792
 7 7     Estimates World           1956 2795409994
 8 8     Estimates World           1957 2852618337
 9 9     Estimates World           1958 2911249671
10 10    Estimates World           1959 2965950351
# ℹ 22,270 more rows

Plot

Code
# ——— Parameters —————————————————————————————————————————————————————————
years           <- c(2022, 2023, 2024)
min_demand_pc   <- 1      # MWh per capita cutoff (used only for df_all_flags logic)
max_import_pct  <- 0.5    # 50% max import share for Plot 2

# Countries to circle in both plots
highlight2 <- c(
  "Denmark", "Australia", "Canada", "United States", "Germany", "Belgium",
  "United Kingdom", "Spain", "Netherlands", "Ireland", "Greece", "Luxembourg",
  "Lithuania", "China", "India", "Russia", "Qatar", "South Korea",
  "France", "Iceland", "Iran", "Sweden", "Finland", "Ukraine", "Czech Republic",
  "Barbados", "Singapore", "Lebanon", "Cayman Islands", "Bermuda", "Uruguay",
  "Portugal", "Switzerland", "Austria", "Hong Kong",
  "United Arab Emirates", "New Zealand", "Malta", "Italy", "Cyprus", "Japan",
  "Kuwait", "Namibia", "Chile", "Estonia"
)

# ——— 1. Retail prices 2022–24 —————————————————————————————————————————————
df_price <- dfs[["elec_cost"]] %>%
  pivot_longer(-`Country/Area`, names_to = "Cost_Label", values_to = "price") %>%
  mutate(
    Year = case_when(
      Cost_Label == "2022 Sept" ~ 2022,
      Cost_Label == "2023 Mar"  ~ 2023,
      Cost_Label == "2024 Mar"  ~ 2024,
      TRUE                       ~ NA_real_
    )
  ) %>%
  filter(Year %in% years) %>%
  transmute(
    Country = `Country/Area`,
    Year,
    price
  )

# ——— 2. Fuel shares (%) for 2023 (fill forward from 2022) ————————————————————
df_shares_23 <- dfs[["Ember_annual"]] %>%
  filter(
    `Area type` == "Country",
    Category    == "Electricity generation",
    Subcategory == "Fuel",
    Unit        == "%",
    Year %in% c(2022, 2023)
  ) %>%
  select(Year, `Country/Area`, Variable, Value) %>%
  pivot_wider(
    names_from  = Variable,
    values_from = Value,
    values_fill = 0
  ) %>%
  rename(Country = `Country/Area`) %>%
  arrange(Country, Year) %>%
  group_by(Country) %>%
  fill(
    Bioenergy, Coal, Gas, Hydro, Nuclear,
    `Other Fossil`, `Other Renewables`,
    Solar, Wind,
    .direction = "down"
  ) %>%
  ungroup() %>%
  filter(Year == 2023) %>%
  transmute(
    Country,
    share_bioenergy        = Bioenergy,
    share_coal             = Coal,
    share_gas              = Gas,
    share_hydro            = Hydro,
    share_nuclear          = Nuclear,
    share_other_fossil     = `Other Fossil`,
    share_other_renewables = `Other Renewables`,
    share_solar            = Solar,
    share_wind             = Wind,
    wind_solar_share       = Solar + Wind
  )

# ——— 3. Electricity demand & per-capita for 2023 —————————————————————————————————
df_demand_23 <- dfs[["Ember_annual"]] %>%
  filter(
    Category    == "Electricity demand",
    Subcategory %in% c("Demand", "Demand per capita"),
    Year        == 2023
  ) %>%
  select(`Country/Area`, Subcategory, Value) %>%
  pivot_wider(
    id_cols     = `Country/Area`,
    names_from  = Subcategory,
    values_from = Value
  ) %>%
  rename(
    Country       = `Country/Area`,
    demand_TWh    = Demand,
    demand_pc_MWh = `Demand per capita`
  )

# ——— 4. GDP per capita: use 2023 if available, else 2022 ——————————————————————
df_gdp_23 <- dfs[["gdp_capita"]] %>%
  select(`Country/Area`, `2022`, `2023`) %>%
  rename(
    Country  = `Country/Area`,
    gdp_2022 = `2022`,
    gdp_2023 = `2023`
  ) %>%
  mutate(
    gdp_pc = coalesce(gdp_2023, gdp_2022)
  ) %>%
  select(Country, gdp_pc)

# ——— 5. Net imports for 2022 & 2023, compute import_pct (off demand) —━━━━━━━━━━━━━
df_imp_all <- dfs[["Ember_annual"]] %>%
  filter(
    Category == "Electricity imports",
    Variable == "Net Imports",
    Year %in% c(2022, 2023)
  ) %>%
  select(Year, `Country/Area`, Value) %>%
  pivot_wider(
    names_from   = Year,
    names_prefix = "imp_",
    values_from  = Value,
    values_fill  = 0
  ) %>%
  rename(Country = `Country/Area`) %>%
  mutate(
    net_import = coalesce(imp_2023, imp_2022)
  ) %>%
  select(Country, net_import)

# ——— 6. Build full 2023 dataset (before filtering) —————————————————————————————
df_plot_23_raw <- df_price %>%
  filter(Year == 2023) %>%
  inner_join(df_shares_23,  by = "Country") %>%
  inner_join(df_demand_23,  by = "Country") %>%
  inner_join(df_gdp_23,     by = "Country") %>%
  inner_join(df_imp_all,    by = "Country") %>%
  mutate(
    import_pct = net_import / demand_TWh,
    sum_fossil           = share_coal + share_gas + share_other_fossil,
    sum_hydro_nuclear    = share_hydro + share_nuclear,
    sum_wind_solar       = share_wind + share_solar,
    sum_other_renewables = share_bioenergy + share_other_renewables,
    largest_source = factor(
      case_when(
        sum_fossil           >= sum_hydro_nuclear &
        sum_fossil           >= sum_wind_solar &
        sum_fossil           >= sum_other_renewables ~ "Coal & Fossil Fuels",
        sum_hydro_nuclear    >= sum_fossil &
        sum_hydro_nuclear    >= sum_wind_solar &
        sum_hydro_nuclear    >= sum_other_renewables ~ "Hydro & Nuclear",
        sum_wind_solar       >= sum_fossil &
        sum_wind_solar       >= sum_hydro_nuclear &
        sum_wind_solar       >= sum_other_renewables ~ "Wind & Solar",
        TRUE                                            ~ "Bioenergy & Other Renewables"
      ),
      levels = c(
        "Coal & Fossil Fuels",
        "Hydro & Nuclear",
        "Wind & Solar",
        "Bioenergy & Other Renewables"
      )
    ),
    in_plot1 = TRUE
  )

# ——— 7. Shaded regions & midpoints for Plot 1 ——————————————————————————————————
lm1   <- lm(price ~ wind_solar_share, data = df_plot_23_raw)
r2_1  <- summary(lm1)$r.squared
ymax1 <- max(pretty_breaks(5)(c(0, df_plot_23_raw$price)))

# Define buckets and rectangles
breaks1        <- c(0, 21, 33, Inf)
bucket_labels1 <- c("< 21%", "21–33%", "≥ 33%")
rects1 <- tibble(
  xmin   = c(-Inf, 21, 33),
  xmax   = c(21,    33, Inf),
  bucket = bucket_labels1
)

# Compute midpoint‐x and fitted y for each bucket
mid_df1 <- tibble(
  bucket = bucket_labels1,
  x_mid  = c(
    (0 + 21)/2,
    (21 + 33)/2,
    (33 + max(df_plot_23_raw$wind_solar_share, na.rm = TRUE))/2
  )
) %>%
  mutate(
    y_pred = predict(lm1, newdata = data.frame(wind_solar_share = x_mid))
  )

# ——— 8. Plot 1: unfiltered scatter (all countries + shaded regions) —————————————————
y_brks1 <- pretty_breaks(5)(c(0, df_plot_23_raw$price))

ggplot() +
  # shaded buckets
  geom_rect(
    data        = rects1,
    aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = ymax1, fill = bucket),
    inherit.aes = FALSE,
    alpha       = 0.2
  ) +
  scale_fill_manual(
    values = c(
      "< 21%"  = "#fee5d9",
      "21–33%" = "#fcae91",
      "≥ 33%"  = "#de2d26"
    ),
    guide = FALSE
  ) +

  # regression line
  geom_smooth(
    data    = df_plot_23_raw,
    aes(x = wind_solar_share, y = price),
    method  = "lm", formula = y ~ x,
    se      = FALSE, colour = "black"
  ) +

  # all points
  geom_point(
    data = df_plot_23_raw,
    aes(x = wind_solar_share, y = price, colour = largest_source, size = demand_TWh),
    shape = 16, alpha = 0.7
  ) +

  # highlight rings
  geom_point(
    data        = df_plot_23_raw %>% filter(Country %in% highlight2),
    aes(x = wind_solar_share, y = price, size = demand_TWh),
    shape       = 21,
    fill        = NA,
    colour      = "black",
    stroke      = 0.8,
    show.legend = FALSE
  ) +

  # country labels for highlighted points
  geom_text_repel(
    data    = df_plot_23_raw %>% filter(Country %in% highlight2),
    aes(x = wind_solar_share, y = price, label = Country),
    colour        = "black",
    size          = 4,
    fontface      = "plain",
    max.overlaps  = Inf,
    point.padding = unit(0.4, "lines"),   
    box.padding   = unit(0.4, "lines"),
    segment.size  = 0.5
  ) +

  # X marks at bucket midpoints
  geom_point(
    data   = mid_df1,
    aes(x = x_mid, y = y_pred),
    shape  = 4,
    size   = 4,
    stroke = 1.2,
    colour = "black"
  ) +
  geom_text(
    data      = mid_df1,
    aes(x = x_mid, y = y_pred, label = scales::dollar(y_pred)),
    vjust     = -1.2,          # move label further above the X
    fontface  = "bold",
    size      = 4
  ) +

  # R² annotation
  annotate(
    "text", x = Inf, y = -Inf,
    label = paste0("R² = ", round(r2_1, 2)),
    hjust = 1.1, vjust = -0.5, size = 3
  ) +

  scale_colour_manual(
    values = c(
      "Coal & Fossil Fuels"           = "#E6550D",
      "Hydro & Nuclear"               = "#3182BD",
      "Wind & Solar"                  = "#31A354",
      "Bioenergy & Other Renewables"  = "gray50"
    ),
    name = "Largest Generation Source",
    guide = guide_legend(
      override.aes = list(size = 6)   # enlarge legend dots
    )
  ) +
  scale_size_continuous(
    range = c(3, 9),
    name  = "Demand (TWh)",
    guide = guide_legend(
      order        = 2,
      override.aes = list(shape = 16, linetype = 0),
      key_glyph    = draw_key_point
    )
  ) +
  scale_y_continuous(
    limits = c(0, ymax1),
    breaks = y_brks1,
    expand = expansion(mult = c(0, 0))
  ) +
  labs(
    title    = "2023 Retail Electricity Price vs Wind + Solar Share (All Countries)",
    subtitle = NULL,
    caption  = "Source: World Population Review & Ember",
    x        = "Wind + Solar Generation Share (%)",
    y        = "Price (US $/kWh)"
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor   = element_blank(),
    axis.line.x        = element_line(colour = "black"),
    axis.line.y        = element_line(colour = "black"),
    axis.ticks.length  = unit(-2, "pt"),
    axis.text.x        = element_text(margin = margin(t = 5), colour = "black", size = 12),
    axis.text.y        = element_text(margin = margin(r = 5), colour = "black", size = 12),
    axis.title.x       = element_text(margin = margin(t = 10), size = 14),
    axis.title.y       = element_text(margin = margin(r = 10), size = 14),
    legend.position    = "right"
  )

Code
# ——— 9. Apply filters for Plot 2 and capture flags —━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
df_filtered <- df_plot_23_raw %>%
  mutate(in_plot2 = (import_pct <= max_import_pct))

df_top30_plot2 <- df_filtered %>%
  filter(in_plot2) %>%
  arrange(desc(gdp_pc)) %>%
  slice_head(n = 30) %>%
  mutate(in_plot2 = TRUE)

df_all_flags <- df_plot_23_raw %>%
  left_join(
    df_top30_plot2 %>% select(Country, in_plot2),
    by = "Country"
  ) %>%
  mutate(
    in_plot2 = replace_na(in_plot2, FALSE)
  )

# ——— 10. Shaded regions & midpoints for Plot 2 —━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
lm2   <- lm(price ~ wind_solar_share, data = df_top30_plot2)
r2_2  <- summary(lm2)$r.squared
ymax2 <- max(pretty_breaks(5)(c(0, df_top30_plot2$price)))

# Define buckets and rectangles
breaks2        <- c(0, 10, 20, Inf)
bucket_labels2 <- c("< 10%", "10–20%", "≥ 20%")
rects2 <- tibble(
  xmin   = c(-Inf, 10, 20),
  xmax   = c(10,    20, Inf),
  bucket = bucket_labels2
)

# Compute midpoint‐x and fitted y for each bucket
mid_df2 <- tibble(
  bucket = bucket_labels2,
  x_mid  = c(
    (0 + 10)/2,
    (10 + 20)/2,
    (20 + max(df_top30_plot2$wind_solar_share, na.rm = TRUE))/2
  )
) %>%
  mutate(
    y_pred = predict(lm2, newdata = data.frame(wind_solar_share = x_mid))
  )

# ——— 11. Plot 2: top-30 by GDP/pp, import ≤ 50% (with shaded regions) —━━━━━━━━━━━━━━━━━━
y_brks2 <- pretty_breaks(5)(c(0, df_top30_plot2$price))

ggplot() +
  # shaded buckets
  geom_rect(
    data        = rects2,
    aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = ymax2, fill = bucket),
    inherit.aes = FALSE,
    alpha       = 0.2
  ) +
  scale_fill_manual(
    values = c(
      "< 10%"  = "#fee5d9",
      "10–20%" = "#fcae91",
      "≥ 20%"  = "#de2d26"
    ),
    guide = FALSE
  ) +

  # regression line
  geom_smooth(
    data    = df_top30_plot2,
    aes(x = wind_solar_share, y = price),
    method  = "lm", formula = y ~ x,
    se      = FALSE, colour = "black"
  ) +

  # all points
  geom_point(
    data = df_top30_plot2,
    aes(x = wind_solar_share, y = price, colour = largest_source, size = demand_TWh),
    shape = 16, alpha = 0.7
  ) +

  # highlight rings
  geom_point(
    data        = df_top30_plot2 %>% filter(Country %in% highlight2),
    aes(x = wind_solar_share, y = price, size = demand_TWh),
    shape       = 21,
    fill        = NA,
    colour      = "black",
    stroke      = 0.8,
    show.legend = FALSE
  ) +

  # country labels for highlighted points
  geom_text_repel(
    data    = df_top30_plot2 %>% filter(Country %in% highlight2),
    aes(x = wind_solar_share, y = price, label = Country),
    colour        = "black",
    size          = 4,
    fontface      = "plain",
    max.overlaps  = Inf,
    point.padding = unit(0.75, "lines"),
    box.padding   = unit(0.4, "lines"),
    segment.size  = 0.5
  ) +

  # X marks at bucket midpoints
  geom_point(
    data   = mid_df2,
    aes(x = x_mid, y = y_pred),
    shape  = 4,
    size   = 4,
    stroke = 1.2,
    colour = "black"
  ) +
  geom_text_repel(
  data          = mid_df2,
  aes(x         = x_mid, 
      y         = y_pred, 
      label     = scales::dollar(y_pred)),
  nudge_y       = 0.01,            # small upwards nudge to avoid sitting exactly on the X
  fontface      = "bold",
  size          = 4,
  max.overlaps  = Inf,             # keep trying until it stops overlapping
  point.padding = unit(1, "lines"),
  box.padding   = unit(0.5, "lines"),
  segment.size  = 0.3,
  inherit.aes   = FALSE
) +

  # R² annotation
  annotate(
    "text", x = Inf, y = -Inf,
    label = paste0("R² = ", round(r2_2, 2)),
    hjust = 1.1, vjust = -0.5, size = 3
  ) +

  scale_colour_manual(
    values = c(
      "Coal & Fossil Fuels"           = "#E6550D",
      "Hydro & Nuclear"               = "#3182BD",
      "Wind & Solar"                  = "#31A354",
      "Bioenergy & Other Renewables"  = "gray50"
    ),
    name = "Largest Generation Source",
    guide = guide_legend(
      override.aes = list(size = 6)
    )
  ) +
  scale_size_continuous(
    range = c(3, 9),
    name  = "Demand (TWh)",
    guide = guide_legend(
      order        = 2,
      override.aes = list(shape = 16, linetype = 0),
      key_glyph    = draw_key_point
    )
  ) +
  scale_y_continuous(
    limits = c(0, ymax2),
    breaks = y_brks2,
    expand = expansion(mult = c(0, 0))
  ) +
  labs(
    title    = "2023 Retail Electricity Price vs Wind + Solar Share",
    subtitle = "Top 30 GDP/pp countries producing > 50% of their electricity domestically",
    caption  = "Source: World Population Review & Ember",
    x        = "Wind + Solar Generation Share (%)",
    y        = "Price (US $/kWh)"
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor   = element_blank(),
    axis.line.x        = element_line(colour = "black"),
    axis.line.y        = element_line(colour = "black"),
    axis.ticks.length  = unit(-2, "pt"),
    axis.text.x        = element_text(margin = margin(t = 5), colour = "black", size = 12),
    axis.text.y        = element_text(margin = margin(r = 5), colour = "black", size = 12),
    axis.title.x       = element_text(margin = margin(t = 10), size = 14),
    axis.title.y       = element_text(margin = margin(r = 10), size = 14),
    legend.position    = "right"
  )

Code
# ——— 12. df_all_flags: show which countries appear in Plot 1 and Plot 2 —━━━━━━━━━━━
print(df_all_flags)
# A tibble: 131 × 25
   Country        Year  price share_bioenergy share_coal share_gas share_hydro
   <chr>         <dbl>  <dbl>           <dbl>      <dbl>     <dbl>       <dbl>
 1 India          2023 0.0731            1.9        75.1      2.69        7.62
 2 China          2023 0.0791            2.1        60.8      3.15       13.0 
 3 United States  2023 0.175             1.11       15.9     42.5         5.62
 4 Indonesia      2023 0.0951            6.41       61.8     17.7         7.01
 5 Pakistan       2023 0.0371            1.06       15.3     26.2        18.9 
 6 Nigeria        2023 0.0491            0.14        0       77.1        22.5 
 7 Brazil         2023 0.197             7.86        2        5.31       60.4 
 8 Bangladesh     2023 0.0541            0          10.1     67.7         0.61
 9 Russia         2023 0.0641            0.07       17.9     44.8        17.0 
10 Ethiopia       2023 0.0061            0.05        0        0          96.5 
# ℹ 121 more rows
# ℹ 18 more variables: share_nuclear <dbl>, share_other_fossil <dbl>,
#   share_other_renewables <dbl>, share_solar <dbl>, share_wind <dbl>,
#   wind_solar_share <dbl>, demand_TWh <dbl>, demand_pc_MWh <dbl>,
#   gdp_pc <dbl>, net_import <dbl>, import_pct <dbl>, sum_fossil <dbl>,
#   sum_hydro_nuclear <dbl>, sum_wind_solar <dbl>, sum_other_renewables <dbl>,
#   largest_source <fct>, in_plot1 <lgl>, in_plot2 <lgl>