Data Cleaning Final Project

Cleaning Data for the Final Project

International Energy Agency Data

  1. Load libraries and packages
#loading libraries
options(repos = c(CRAN = "https://cran.rstudio.com/"))
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.1     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(countrycode)
library(dplyr)
library(ggplot2)
library(lubridate)
library(readxl)
install.packages("xfun")
Installing package into 'C:/Users/alyss/AppData/Local/R/win-library/4.2'
(as 'lib' is unspecified)
package 'xfun' successfully unpacked and MD5 sums checked
Warning: cannot remove prior installation of package 'xfun'
Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\alyss\AppData\Local\R\win-library\4.2\00LOCK\xfun\libs\x64\xfun.dll to
C:\Users\alyss\AppData\Local\R\win-library\4.2\xfun\libs\x64\xfun.dll:
Permission denied
Warning: restored 'xfun'

The downloaded binary packages are in
    C:\Users\alyss\AppData\Local\Temp\RtmpAnZtCM\downloaded_packages
library(xfun)

Attaching package: 'xfun'

The following objects are masked from 'package:base':

    attr, isFALSE
library(esquisse)
install.packages("stringr")
Warning: package 'stringr' is in use and will not be installed
library(stringr)
library(openxlsx)
  1. Import the data, rename variables that have confusing or similar names, and pivot the data to follow the “tidy” data rules.
#read in data

Raw_World_Investment_Data <- read_excel("C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/00_data_raw/World_Energy_Investment_raw.xlsx", sheet = 1)

# renaming variables that have the same names 

Raw_World_Investment_Data[3, "World"] <- "Total Supply"
Raw_World_Investment_Data[8, "World"] <- "Total End Use"
Raw_World_Investment_Data[12, "World"] <- "Total Fuels"
Raw_World_Investment_Data[13, "World"] <- "Fuels_Total Fossil Fuels"
Raw_World_Investment_Data[14, "World"] <- "Fuels_Oil"
Raw_World_Investment_Data[15, "World"] <- "Fuels_Gas"
Raw_World_Investment_Data[16, "World"] <- "Fuels_Coal"
Raw_World_Investment_Data[17, "World"] <- "Fuels_Clean Fuels"

Raw_World_Investment_Data[18, "World"] <- "Total Power"
Raw_World_Investment_Data[19, "World"] <- "Power_Generation"
Raw_World_Investment_Data[20, "World"] <- "Power_Coal"
Raw_World_Investment_Data[21, "World"] <- "Power_Oil_Natural_Gas"
Raw_World_Investment_Data[22, "World"] <- "Power_Nuclear"
Raw_World_Investment_Data[23, "World"] <- "Power_Renewables"
Raw_World_Investment_Data[24, "World"] <- "Power_Fossil_Fuels_CCUS"
Raw_World_Investment_Data[25, "World"] <- "Power_Storage"
Raw_World_Investment_Data[2, "World"] <- "Total Clean Energy"
Raw_World_Investment_Data[6, "World"] <- "Supply_Electricity Networks"
Raw_World_Investment_Data[26, "World"] <- "Power_Electricity Networks"

# pivot longer to make Year column

long_Investment_data <- pivot_longer(Raw_World_Investment_Data, 
  cols = -World,  
  names_to = "Year", 
  values_to = "Value" 
)

# pivot wider to make variable columns 
wide_Investment_data <- pivot_wider(long_Investment_data,
  names_from = World, 
  values_from = Value
)
  1. Repeat manually or create a function for the remaining sheets

Above, I cleaned the first sheet of the IEA data, but the excel file I’m using has 6 tabs of data in a similar format. Here you can create a function to clean all sheets in a similar fashion, or manually run the same code multiple times. As I was updating unique variable names, I copied the code below for the remaining sheets.

Sheet 2:

# Sheet 2 

Raw_World_Investment_Data_2 <- read_excel("C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/00_data_raw/World_Energy_Investment_raw.xlsx", sheet = 2)

# renaming variables that have the same names 

Raw_World_Investment_Data_2[3, "Emerging market and developing economies"] <- "Total Supply"

View(Raw_World_Investment_Data_2)

Raw_World_Investment_Data_2[8, "Emerging market and developing economies"] <- "Total End Use"
Raw_World_Investment_Data_2[12, "Emerging market and developing economies"] <- "Total Fuels"
Raw_World_Investment_Data_2[13, "Emerging market and developing economies"] <- "Fuels_Total Fossil Fuels"
Raw_World_Investment_Data_2[14, "Emerging market and developing economies"] <- "Fuels_Oil"
Raw_World_Investment_Data_2[15, "Emerging market and developing economies"] <- "Fuels_Gas"
Raw_World_Investment_Data_2[16, "Emerging market and developing economies"] <- "Fuels_Coal"
Raw_World_Investment_Data_2[17, "Emerging market and developing economies"] <- "Fuels_Clean Fuels"

Raw_World_Investment_Data_2[18, "Emerging market and developing economies"] <- "Total Power"
Raw_World_Investment_Data_2[19, "Emerging market and developing economies"] <- "Power_Generation"
Raw_World_Investment_Data_2[20, "Emerging market and developing economies"] <- "Power_Coal"
Raw_World_Investment_Data_2[21, "Emerging market and developing economies"] <- "Power_Oil_Natural_Gas"
Raw_World_Investment_Data_2[22, "Emerging market and developing economies"] <- "Power_Nuclear"
Raw_World_Investment_Data_2[23, "Emerging market and developing economies"] <- "Power_Renewables"
Raw_World_Investment_Data_2[24, "Emerging market and developing economies"] <- "Power_Fossil_Fuels_CCUS"
Raw_World_Investment_Data_2[25, "Emerging market and developing economies"] <- "Power_Storage"
Raw_World_Investment_Data_2[2, "Emerging market and developing economies"] <- "Total Clean Energy"
Raw_World_Investment_Data_2[6, "Emerging market and developing economies"] <- "Supply_Electricity Networks"
Raw_World_Investment_Data_2[26, "Emerging market and developing economies"] <- "Power_Electricity Networks"

# pivot longer to create year column
long_Investment_data_2 <- pivot_longer(Raw_World_Investment_Data_2, 
  cols = -`Emerging market and developing economies`,  
  names_to = "Year", 
  values_to = "Value" 
)

# pivot wider to make variable columns 
wide_Investment_data_2 <- pivot_wider(long_Investment_data_2,
  names_from = `Emerging market and developing economies`, 
  values_from = Value
)

# delete extraneous row
wide_Investment_data_2_2 <- wide_Investment_data_2[-10, ]

Sheet 3:

# Sheet 3 

Raw_World_Investment_Data_3 <- read_excel("C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/00_data_raw/World_Energy_Investment_raw.xlsx", sheet = 3)

# renaming variables that have the same names 
Raw_World_Investment_Data_3[3, "Central and South America"] <- "Total Supply"

View(Raw_World_Investment_Data_3)

Raw_World_Investment_Data_3[8, "Central and South America"] <- "Total End Use"
Raw_World_Investment_Data_3[12, "Central and South America"] <- "Total Fuels"
Raw_World_Investment_Data_3[13, "Central and South America"] <- "Fuels_Total Fossil Fuels"
Raw_World_Investment_Data_3[14, "Central and South America"] <- "Fuels_Oil"
Raw_World_Investment_Data_3[15, "Central and South America"] <- "Fuels_Gas"
Raw_World_Investment_Data_3[16, "Central and South America"] <- "Fuels_Coal"
Raw_World_Investment_Data_3[17, "Central and South America"] <- "Fuels_Clean Fuels"

Raw_World_Investment_Data_3[18, "Central and South America"] <- "Total Power"
Raw_World_Investment_Data_3[19, "Central and South America"] <- "Power_Generation"
Raw_World_Investment_Data_3[20, "Central and South America"] <- "Power_Coal"
Raw_World_Investment_Data_3[21, "Central and South America"] <- "Power_Oil_Natural_Gas"
Raw_World_Investment_Data_3[22, "Central and South America"] <- "Power_Nuclear"
Raw_World_Investment_Data_3[23, "Central and South America"] <- "Power_Renewables"
Raw_World_Investment_Data_3[24, "Central and South America"] <- "Power_Fossil_Fuels_CCUS"
Raw_World_Investment_Data_3[25, "Central and South America"] <- "Power_Storage"
Raw_World_Investment_Data_3[2, "Central and South America"] <- "Total Clean Energy"
Raw_World_Investment_Data_3[6, "Central and South America"] <- "Supply_Electricity Networks"
Raw_World_Investment_Data_3[26, "Central and South America"] <- "Power_Electricity Networks"

# pivot longer to create year column
long_Investment_data_3 <- pivot_longer(Raw_World_Investment_Data_3, 
  cols = -`Central and South America`,  
  names_to = "Year", 
  values_to = "Value" 
)

# pivot wider to make variable columns 
wide_Investment_data_3 <- pivot_wider(long_Investment_data_3,
  names_from = `Central and South America`, 
  values_from = Value
)

Sheet 4:

# sheet 4
Raw_World_Investment_Data_4 <- read_excel("C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/00_data_raw/World_Energy_Investment_raw.xlsx", sheet = 4)

# renaming variables that have the same names 
Raw_World_Investment_Data_4[3, "Africa"] <- "Total Supply"

View(Raw_World_Investment_Data_4)

Raw_World_Investment_Data_4[8, "Africa"] <- "Total End Use"
Raw_World_Investment_Data_4[12, "Africa"] <- "Total Fuels"
Raw_World_Investment_Data_4[13, "Africa"] <- "Fuels_Total Fossil Fuels"
Raw_World_Investment_Data_4[14, "Africa"] <- "Fuels_Oil"
Raw_World_Investment_Data_4[15, "Africa"] <- "Fuels_Gas"
Raw_World_Investment_Data_4[16, "Africa"] <- "Fuels_Coal"
Raw_World_Investment_Data_4[17, "Africa"] <- "Fuels_Clean Fuels"

Raw_World_Investment_Data_4[18, "Africa"] <- "Total Power"
Raw_World_Investment_Data_4[19, "Africa"] <- "Power_Generation"
Raw_World_Investment_Data_4[20, "Africa"] <- "Power_Coal"
Raw_World_Investment_Data_4[21, "Africa"] <- "Power_Oil_Natural_Gas"
Raw_World_Investment_Data_4[22, "Africa"] <- "Power_Nuclear"
Raw_World_Investment_Data_4[23, "Africa"] <- "Power_Renewables"
Raw_World_Investment_Data_4[24, "Africa"] <- "Power_Fossil_Fuels_CCUS"
Raw_World_Investment_Data_4[25, "Africa"] <- "Power_Storage"
Raw_World_Investment_Data_4[2, "Africa"] <- "Total Clean Energy"
Raw_World_Investment_Data_4[6, "Africa"] <- "Supply_Electricity Networks"
Raw_World_Investment_Data_4[26, "Africa"] <- "Power_Electricity Networks"

# pivot longer to create year column
long_Investment_data_4 <- pivot_longer(Raw_World_Investment_Data_4, 
  cols = -`Africa`,  
  names_to = "Year", 
  values_to = "Value" 
)

# pivot wider to make variable columns 
wide_Investment_data_4 <- pivot_wider(long_Investment_data_4,
  names_from = `Africa`, 
  values_from = Value
)

Sheet 5:

# Sheet 5
Raw_World_Investment_Data_5 <- read_excel("C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/00_data_raw/World_Energy_Investment_raw.xlsx", sheet = 5)

# renaming variables that have the same names 
Raw_World_Investment_Data_5[3, "Asia Pacific"] <- "Total Supply"

View(Raw_World_Investment_Data_5)

Raw_World_Investment_Data_5[8, "Asia Pacific"] <- "Total End Use"
Raw_World_Investment_Data_5[12, "Asia Pacific"] <- "Total Fuels"
Raw_World_Investment_Data_5[13, "Asia Pacific"] <- "Fuels_Total Fossil Fuels"
Raw_World_Investment_Data_5[14, "Asia Pacific"] <- "Fuels_Oil"
Raw_World_Investment_Data_5[15, "Asia Pacific"] <- "Fuels_Gas"
Raw_World_Investment_Data_5[16, "Asia Pacific"] <- "Fuels_Coal"
Raw_World_Investment_Data_5[17, "Asia Pacific"] <- "Fuels_Clean Fuels"

Raw_World_Investment_Data_5[18, "Asia Pacific"] <- "Total Power"
Raw_World_Investment_Data_5[19, "Asia Pacific"] <- "Power_Generation"
Raw_World_Investment_Data_5[20, "Asia Pacific"] <- "Power_Coal"
Raw_World_Investment_Data_5[21, "Asia Pacific"] <- "Power_Oil_Natural_Gas"
Raw_World_Investment_Data_5[22, "Asia Pacific"] <- "Power_Nuclear"
Raw_World_Investment_Data_5[23, "Asia Pacific"] <- "Power_Renewables"
Raw_World_Investment_Data_5[24, "Asia Pacific"] <- "Power_Fossil_Fuels_CCUS"
Raw_World_Investment_Data_5[25, "Asia Pacific"] <- "Power_Storage"
Raw_World_Investment_Data_5[2, "Asia Pacific"] <- "Total Clean Energy"
Raw_World_Investment_Data_5[6, "Asia Pacific"] <- "Supply_Electricity Networks"
Raw_World_Investment_Data_5[26, "Asia Pacific"] <- "Power_Electricity Networks"

# pivot longer to create year column
long_Investment_data_5 <- pivot_longer(Raw_World_Investment_Data_5, 
  cols = -`Asia Pacific`,  
  names_to = "Year", 
  values_to = "Value" 
)

# pivot wider to make variable columns 
wide_Investment_data_5 <- pivot_wider(long_Investment_data_5,
  names_from = `Asia Pacific`, 
  values_from = Value
)

Sheet 6:

# sheet 6
Raw_World_Investment_Data_6 <- read_excel("C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/00_data_raw/World_Energy_Investment_raw.xlsx", sheet = 6)

# renaming variables that have the same names 
Raw_World_Investment_Data_6[3, "Middle East"] <- "Total Supply"

View(Raw_World_Investment_Data_6)

Raw_World_Investment_Data_6[8, "Middle East"] <- "Total End Use"
Raw_World_Investment_Data_6[12, "Middle East"] <- "Total Fuels"
Raw_World_Investment_Data_6[13, "Middle East"] <- "Fuels_Total Fossil Fuels"
Raw_World_Investment_Data_6[14, "Middle East"] <- "Fuels_Oil"
Raw_World_Investment_Data_6[15, "Middle East"] <- "Fuels_Gas"
Raw_World_Investment_Data_6[16, "Middle East"] <- "Fuels_Coal"
Raw_World_Investment_Data_6[17, "Middle East"] <- "Fuels_Clean Fuels"

Raw_World_Investment_Data_6[18, "Middle East"] <- "Total Power"
Raw_World_Investment_Data_6[19, "Middle East"] <- "Power_Generation"
Raw_World_Investment_Data_6[20, "Middle East"] <- "Power_Coal"
Raw_World_Investment_Data_6[21, "Middle East"] <- "Power_Oil_Natural_Gas"
Raw_World_Investment_Data_6[22, "Middle East"] <- "Power_Nuclear"
Raw_World_Investment_Data_6[23, "Middle East"] <- "Power_Renewables"
Raw_World_Investment_Data_6[24, "Middle East"] <- "Power_Fossil_Fuels_CCUS" 
Raw_World_Investment_Data_6[25, "Middle East"] <- "Power_Storage"
Raw_World_Investment_Data_6[2, "Middle East"] <- "Total Clean Energy"
Raw_World_Investment_Data_6[6, "Middle East"] <- "Supply_Electricity Networks"
Raw_World_Investment_Data_6[26, "Middle East"] <- "Power_Electricity Networks"

# pivot longer to create year column
long_Investment_data_6 <- pivot_longer(Raw_World_Investment_Data_6, 
  cols = -`Middle East`,  
  names_to = "Year", 
  values_to = "Value" 
)

# pivot wider to make variable columns 
wide_Investment_data_6 <- pivot_wider(long_Investment_data_6,
  names_from = `Middle East`, 
  values_from = Value
)
  1. Now it’s time to combine the six data frames into one, and start grouping the data into a usable format for making graphics.
# Combine  
combined_Energy_Investment <- bind_rows(wide_Investment_data, wide_Investment_data_2_2, wide_Investment_data_3, wide_Investment_data_4, wide_Investment_data_5, wide_Investment_data_6)

# Reorder columns
combined_Energy_Investment_1 <- combined_Energy_Investment %>% select(Region, everything())

# Cleaning names 
combined_Energy_Investment_2 <- combined_Energy_Investment_1 |> 
  rename(`total_billion` = `Total (billion $2022)`) |> 
  clean_names()

# Making sure the data is numeric
combined_Energy_Investment_2$total_billion <- as.numeric(combined_Energy_Investment_2$total_billion)

combined_Energy_Investment_2$total_clean_energy <- as.numeric(combined_Energy_Investment_2$total_clean_energy)

# Creating a sum total of energy investments and clean energy investments 
Energy_investment_total_by_region <- combined_Energy_Investment_2 %>%
  group_by(region) %>%
  summarise(total_investment = sum(total_billion))

Clean_energy_investment_by_region <- combined_Energy_Investment_2 %>%
  group_by(region) %>%
  summarise(total_investment = sum(total_clean_energy))

Clean_energy_investment_by_region <- Clean_energy_investment_by_region |>
  rename(total_clean_investment = total_investment)

# Merge the sum totals 
merged_total_invest_region <- merge(Clean_energy_investment_by_region, Energy_investment_total_by_region, by = c("region"), all = TRUE)

# Pivot long to get different types of investment in one column
total_invest_long_1 <- pivot_longer(merged_total_invest_region, 
                        cols = c(total_clean_investment, total_investment), 
                        names_to = "Investment_Type", 
                        values_to = "Investment_Amount")

# Graph of investment amounts per region
ggplot(total_invest_long_1, aes(x = region, y = Investment_Amount, fill = Investment_Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("total_clean_investment" = "forestgreen", "total_investment" = "darkblue"),
                    labels = c("total_clean_investment" = "Investments in Clean Energy", "total_investment" = "Total Energy Investments")) +
  labs(title = "Comparison of Energy Investments by Region (2015-2023)",
       x = "",
       y = "Investment (USD billions)",
       caption = "Data: International Energy Agency",
       fill = "Investment Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 6), 
        axis.text.y = element_text(size = 6),  # Smaller y-axis title
        axis.title.y = element_text(size = 6),  # Smaller y-axis title
        plot.title = element_text(size = 8, hjust = 0.5),
        legend.title = element_text(size = 7),  # Adjust legend title size
        legend.text = element_text(size = 6),   # Adjust legend text size
        legend.key.size = unit(0.5, "cm"),
        plot.caption = element_text(size=6))   

# Reordering regions for the graph
total_invest_long_1$region <- factor(total_invest_long_1$region,
    levels = c("Africa", "Asia Pacific", "Latin America", "Middle East","Emerging Markets", "World"))

# Write out data 
write.xlsx(total_invest_long_1, file = "C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/03_data_processed.xlsx")

World Bank Data

5. Next I cleaned the World Bank data on access to electricity. After importing in the data, I adjusted the placement of the variables, cleaned the names and pivoted the data.

# Cleaning WB Data 

#read in data
Raw_WB_Electricity_Data <- read_excel("C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/00_data_raw/API_EG.ELC.ACCS.ZS_DS2_en_excel_v2_42577.xls", sheet = 1)
New names:
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
• `` -> `...16`
• `` -> `...17`
• `` -> `...18`
• `` -> `...19`
• `` -> `...20`
• `` -> `...21`
• `` -> `...22`
• `` -> `...23`
• `` -> `...24`
• `` -> `...25`
• `` -> `...26`
• `` -> `...27`
• `` -> `...28`
• `` -> `...29`
• `` -> `...30`
• `` -> `...31`
• `` -> `...32`
• `` -> `...33`
• `` -> `...34`
• `` -> `...35`
• `` -> `...36`
• `` -> `...37`
• `` -> `...38`
• `` -> `...39`
• `` -> `...40`
• `` -> `...41`
• `` -> `...42`
• `` -> `...43`
• `` -> `...44`
• `` -> `...45`
• `` -> `...46`
• `` -> `...47`
• `` -> `...48`
• `` -> `...49`
• `` -> `...50`
• `` -> `...51`
• `` -> `...52`
• `` -> `...53`
• `` -> `...54`
• `` -> `...55`
• `` -> `...56`
• `` -> `...57`
• `` -> `...58`
• `` -> `...59`
• `` -> `...60`
• `` -> `...61`
• `` -> `...62`
• `` -> `...63`
• `` -> `...64`
• `` -> `...65`
• `` -> `...66`
• `` -> `...67`
• `` -> `...68`
# drop columns before 1990 due to missing data
Raw_WB_Electricity_Data_1 <- Raw_WB_Electricity_Data[, -c(4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34)] 

# dropping first two rows
Raw_WB_Electricity_Data_2 <- Raw_WB_Electricity_Data_1[-(1:2), ]

# making the new first row the column names 
colnames(Raw_WB_Electricity_Data_2) <- as.character(unlist(Raw_WB_Electricity_Data_2[1, ]))

# dropping extra names
Raw_WB_Electricity_Data_2 <- Raw_WB_Electricity_Data_2[-1, ]

# pivot longer to create year column
long_WB_Electricity <- pivot_longer(Raw_WB_Electricity_Data_2, 
  cols = !c(`Country Name`, `Country Code`, `Indicator Name`),  
  names_to = "Year", 
  values_to = "Value" 
)

# pivot wider to make variable columns 
wide_WB_Electricity <- pivot_wider(long_WB_Electricity,
  names_from = `Indicator Name`, 
  values_from = Value
)

# clean names 
wide_WB_Electricity_1 <- wide_WB_Electricity |> 
  rename(`iso_code` = `Country Code`) |>
  clean_names()

# write out the data
write.xlsx(wide_WB_Electricity_1, file = "C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/03_data_processed")

Our World in Data

  1. Finally, I cleaned the third and final data set from Our World in Data on different types of energy access around the world. I made sure the data classification matched the data and then merged it with the World Bank data set.
# our world in data

# read in data
Raw_OWID_Data <- read_excel("C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/00_data_raw/owid-energy-data.xlsx", sheet = 1)


# make the numbers numeric
wide_WB_Electricity_1$access_to_electricity_percent_of_population <- as.numeric(wide_WB_Electricity_1$access_to_electricity_percent_of_population)


# matching specifications 
class(Raw_OWID_Data$year)
[1] "numeric"
class(Raw_OWID_Data$iso_code)
[1] "character"
class(wide_WB_Electricity_1$year)
[1] "character"
class(wide_WB_Electricity_1$iso_code)
[1] "character"
# making year integer 
Raw_OWID_Data$year <- as.integer(Raw_OWID_Data$year)
wide_WB_Electricity_1$year <- as.integer(wide_WB_Electricity_1$year)

# merge WB and OWID
merged_WB_OWID_Energy <- merge(Raw_OWID_Data, wide_WB_Electricity_1, by = c("iso_code", "year"), all = TRUE)


merged_WB_OWID <- full_join(Raw_OWID_Data, wide_WB_Electricity_1, by = c("iso_code", "year"))

"access_to_electricity_percent_of_population" %in% names(wide_WB_Electricity_1)
[1] TRUE
"access_to_electricity_percent_of_population" %in% names(merged_WB_OWID)
[1] TRUE
# Write out the data
write.xlsx(merged_WB_OWID, file = "C:/Users/alyss/OneDrive - Johns Hopkins/Desktop/MIEF Courses/Sustainable Finance/03_data_processed")

Example Graphics

# make some graphs 

# graph on electricity % of pop
filtered_electric <- wide_WB_Electricity_1 %>%
  filter(country_name %in% c("Africa Eastern and Southern",     
"Africa Western and Central", "Middle East & North Africa", "Central Europe and the Baltics", "East Asia & Pacific (excluding high income)", "South Asia (IDA & IBRD)", "Latin America & Caribbean"))


filtered_electric_2021 <- filtered_electric %>%
  filter(year == 2021)

# graph one option, regular bar chart
ggplot(filtered_electric_2021) +
  aes(
    x = country_name,
    y = access_to_electricity_percent_of_population,
    fill = country_name
  ) +
  geom_col() +
  scale_fill_viridis_d(option = "viridis", direction = 1) +
  labs(
    x = "Region",
    y = "Access to Electricity (% of population)",
    title = "Access to Electricity in 2021 by Region",
    caption = "Data: World Bank Database",
    fill = "Region"
  ) +
  theme_minimal()+
 theme(axis.text.x = element_text(angle = 45, hjust = 1))

# graoh 2, hortizontal bar chart
ggplot(filtered_electric_2021, aes(x = country_name, y = access_to_electricity_percent_of_population, fill = country_name)) +
 geom_bar(stat = "identity", width = 0.5)+
  coord_flip() + 
  labs(title="Access to Electricity in 2021 by Region", y="Access to Electricity (% of population)", x="Region") +
  theme_minimal()