library(readxl)library(tidyverse)library(dplyr)library(tidyr)library(ggplot2)library(fuzzyjoin)library(lubridate)library(rpart)library(rpart.plot)library(sandwich)library(lmtest)library(clubSandwich)library(car)library(plm)library(lmtest)library(sandwich)library(plotly)library(ggrepel)library(scales)library(ggforce)# Load Excel workbookfile_path <-"10-Data_Workbook.xlsx"# List all sheet names in the workbooksheets <-excel_sheets(file_path)print(sheets)
# Retail electricity pricesdf_elec_cost <-read_excel(file_path, sheet ="1.1REP")# Electricity prices (from globalpetrolprices.com)df_gpp_elec_cost <-read_excel(file_path, sheet ="1.2GPP")# IRENA generation (MW / GWh)df_IRENA_gen <-read_excel(file_path, sheet ="2.1IRENA") # Energy Institute datadf_EI <-read_excel(file_path, sheet ="2.2EI") # Ember annual datadf_Ember_annual <-read_excel(file_path, sheet ="2.3Em") # Ember monthly datadf_Ember_monthly <-read_excel(file_path, sheet ="2.4Em")# Country populationdf_pop <-read_excel(file_path, sheet ="3.1Pop", col_types ="text")# Purchasing Power Parity - convert to longdf_ppp <-read_excel(file_path, sheet ="3.2PPP")df_ppp <- df_ppp %>%pivot_longer(cols =`1960`:`2024`, names_to ="Year",values_to ="ppp",values_drop_na =TRUE, names_transform =list(Year = as.integer) )# Quartlery consumer electricity price indexdf_cpi_energy <-read_excel(file_path, sheet ="1.3energycpiq")# OECD electricity pricesdf_oecd_elec <-read_excel(file_path, sheet ="1.4OECD")# GDP per capita (World Bank)df_gdp_capita <-read_excel(file_path, sheet ="3.3GDP_pp")# Create a named list of data framesdfs <-list(elec_cost = df_elec_cost,IRENA_gen = df_IRENA_gen,EI = df_EI,Ember_annual = df_Ember_annual,Ember_monthly = df_Ember_monthly,pop = df_pop,PPP = df_ppp,gpp = df_gpp_elec_cost,cpi_energy = df_cpi_energy,oecd_elec_price = df_oecd_elec,gdp_capita = df_gdp_capita)
Clean country names
Code
# Load country name mappingcountry_mapping <-read_excel(file_path, sheet ="Country_Map")# Create named vector for exact replacement: 'keys' are original and 'values' are revised namescountry_map <-setNames(country_mapping$Revised_Name, country_mapping$Name)# Helper function to replace country names based on exact matchreplace_country_names <-function(vec, mapping) {# Only update elements that exactly match a key in the mapping matches <- vec %in%names(mapping) vec[matches] <- mapping[vec[matches]]return(vec)}# Define a vector that links each data frame (by name in the list) to its country column namecountry_cols <-c(elec_cost ="Country",IRENA_gen ="Country/area",EI ="Country",Ember_annual ="Area",Ember_monthly ="Area",pop ="Region, subregion, country or area *",PPP ="Country Name",gpp ="Countries",cpi_energy ="country",oecd_elec_price ="Reference area",gdp_capita ="Country Name")# Rename countriesdfs <-imap(dfs, ~ { col <- country_cols[[.y]] .x %>%mutate(!!col :=replace_country_names(.data[[col]], country_map))})# Standardise country/area column namesstandardise_countries <-function(dfs, country_cols, mapping) {imap(dfs, ~ { df <- .x old_col <- country_cols[[.y]]stopifnot(old_col %in%names(df)) df %>%# rename whatever the old header was, into Country/Arearename(`Country/Area`=all_of(old_col)) %>%# trim and mapmutate(`Country/Area`=trimws(`Country/Area`),`Country/Area`=replace_country_names(`Country/Area`, mapping) ) })}dfs <-standardise_countries(dfs, country_cols, country_map)
Check for countries/areas appearing only in one df
Code
# Create a data frame that collects unique country names from each data frame along with its sourcecountry_df <-imap_dfr(dfs, ~ {tibble(DataFrame = .y,`Country/Area`=unique(.x$`Country/Area`) )})# Group by country and keep only those that appear exactly once (i.e. only in one data frame)unique_countries <- country_df %>%group_by(`Country/Area`) %>%filter(n() ==1) %>%# only countries that appear in one data frameungroup() %>%arrange(`Country/Area`)# Display the two-column table (Country, DataFrame)print(unique_countries)
# A tibble: 151 × 2
DataFrame `Country/Area`
<chr> <chr>
1 gdp_capita Africa Eastern and Southern
2 gdp_capita Africa Western and Central
3 pop Americas
4 gdp_capita Arab World
5 pop Australia/New Zealand
6 pop Caribbean
7 gdp_capita Caribbean small states
8 pop Central America
9 pop Central Asia
10 gdp_capita Central Europe and the Baltics
# ℹ 141 more rows
Keep relevant columns in population
Code
# First, create a trimmed version of the column to remove extra spacesdfs[["pop"]] <- dfs[["pop"]] %>%mutate(Total_Population_thousands_Jul_trim =trimws(Total_Population_thousands_Jul))# Identify and print rows with non-numeric values in the trimmed columnnon_numeric <- dfs[["pop"]] %>%filter(!grepl("^[0-9.]+$", Total_Population_thousands_Jul_trim)) %>%select(Index, Total_Population_thousands_Jul)if(nrow(non_numeric) >0) {cat("Rows removed due to non-numeric Total_Population_thousands_Jul values:\n")print(non_numeric)} else {cat("No non-numeric rows found in Total_Population_thousands_Jul.\n")}
No non-numeric rows found in Total_Population_thousands_Jul.
Code
# Filter out any rows that don't have a valid numeric value, and then select & convert columnsdfs[["pop"]] <- dfs[["pop"]] %>%filter(grepl("^[0-9.]+$", Total_Population_thousands_Jul_trim)) %>%select(Index, Variant, `Country/Area`, Year, Total_Population_thousands_Jul) %>%mutate(# Convert the cleaned column to numeric and multiply by 1000Total_Population_thousands_Jul =as.numeric(trimws(Total_Population_thousands_Jul)) *1000,Year =as.numeric(Year) ) %>%rename(Population = Total_Population_thousands_Jul)# Check the updated structureprint(dfs[["pop"]])
# A tibble: 22,280 × 5
Index Variant `Country/Area` Year Population
<chr> <chr> <chr> <dbl> <dbl>
1 1 Estimates World 1950 2493092848
2 2 Estimates World 1951 2536927035
3 3 Estimates World 1952 2584086339
4 4 Estimates World 1953 2634106235
5 5 Estimates World 1954 2685894860
6 6 Estimates World 1955 2740213792
7 7 Estimates World 1956 2795409994
8 8 Estimates World 1957 2852618337
9 9 Estimates World 1958 2911249671
10 10 Estimates World 1959 2965950351
# ℹ 22,270 more rows