creating my dataset

Author

Lara

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(tidyr)
library(dplyr)
library(ggplot2)
library(readxl)
library(janitor)


Attache Paket: 'janitor'

Die folgenden Objekte sind maskiert von 'package:stats':

    chisq.test, fisher.test

library(here)

here() starts at C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff

library(rnaturalearth)
library(countrycode)
library(wbstats)

import data

New names:
New names:
New names:
• `` -> `...2`
• `` -> `...4`
• `` -> `...6`
• `` -> `...8`
• `` -> `...10`
• `` -> `...12`
• `` -> `...14`
• `` -> `...16`
• `` -> `...18`
• `` -> `...20`
• `` -> `...22`
• `` -> `...24`
• `` -> `...26`
• `` -> `...28`
• `` -> `...30`
• `` -> `...32`
• `` -> `...34`
• `` -> `...36`
• `` -> `...38`
• `` -> `...40`
• `` -> `...42`
• `` -> `...44`
• `` -> `...46`
• `` -> `...48`
• `` -> `...50`

rename the time variable to “country” in the OECD dataframes scope_1_2, scope_2, and scope_3 and the variable “Country Name” to “Country” in the World Bank Datasets

scope_1_2 <- scope_1_2 |>
    rename(Country = Time) |>
    slice(-1)

scope_2 <- scope_2 |>
    rename(Country = Time) |>
    slice(-1)

scope_3 <- scope_3 |>
    rename(Country = Time) |>
    slice(-1)

ppp_gdp <- ppp_gdp |>
    rename(Country = "Country Name")

population <- population |>
    rename(Country = "Country Name")

nominal_GDP <- nominal_GDP |>
    rename(Country = "Country Name")

Standardize the country names

#function to turn everything into ISO names 

country_regex_to_iso3c <- function(country_string) {
  country_string |>
    countrycode::countrycode(origin = "country.name", destination = "iso3c", origin_regex = TRUE)
}

#function which turns ISO names into Country names 

iso3c_to_country_name <- function(country_string) {
  country_string |>
    countrycode::countrycode(origin = "iso3c", destination = "country.name")
} 

#unique OECD group values could be interesting. 
#Group values that exist in OECD Data but cannot be matched by functions: 
#ROW: Rest of the World
#APEC: Asia-Pacific Economic Cooperation
#ASEAN: Association of South East Asian Nations
#EASIA: Eastern Asia
#EU27_2020: European Union (27 countries)
#EU28: European Union (28 countries)
#EU15: European Union (15 countries)
#EU13: EU28 excluding EU15
#EA19: Euro area (19 countries)
#G20: Group of Twenty
#ZEUR: Europe
#ZASI: East and Southeastern Asia
#ZNAM: North America
#ZSCA: South and Central America
#ZOTH: Other regions


#adjust country names in all the datasets and keep the original ones from the OECD Data

scope_3 <- scope_3 |>
  mutate(Country = case_when(Country == "T¸rkiye" ~ "Turkey",
                             TRUE ~ Country)) |>
  mutate(iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), 
                         Country, 
                         country_regex_to_iso3c(Country)),
         country_name = ifelse(is.na(iso3c_to_country_name(iso3c)), 
                               Country, 
                               iso3c_to_country_name(iso3c)))

Warning: There were 4 warnings in `mutate()`.
The first warning was:
ℹ In argument: `iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), Country,
  country_regex_to_iso3c(Country))`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC: Asia-Pacific Economic Cooperation, ASEAN: Association of South East Asian Nations, Data extracted on 09 Mar 2023 01:07 UTC (GMT) from OECD.Stat, EA19: Euro area (19 countries), EASIA: Eastern Asia, EU13: EU28 excluding EU15, EU15: European Union (15 countries), EU27_2020: European Union (27 countries), EU28: European Union (28 countries), G20: Group of Twenty, NONOECD: Non-OECD economies and aggregates, OECD: OECD member countries, ROW: Rest of the World, TUR: Türkiye, ZASI: East and Southeastern Asia, ZEUR: Europe, ZNAM: North America, ZOTH: Other regions, ZSCA: South and Central America
ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

scope_2 <- scope_2 |>
   mutate(Country = case_when(Country == "T¸rkiye" ~ "Turkey",
                             TRUE ~ Country)) |>
  mutate(iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), 
                         Country, 
                         country_regex_to_iso3c(Country)),
         country_name = ifelse(is.na(iso3c_to_country_name(iso3c)), 
                               Country, 
                               iso3c_to_country_name(iso3c)))

Warning: There were 4 warnings in `mutate()`.
The first warning was:
ℹ In argument: `iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), Country,
  country_regex_to_iso3c(Country))`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC: Asia-Pacific Economic Cooperation, ASEAN: Association of South East Asian Nations, Data extracted on 09 Mar 2023 00:56 UTC (GMT) from OECD.Stat, EA19: Euro area (19 countries), EASIA: Eastern Asia, EU13: EU28 excluding EU15, EU15: European Union (15 countries), EU27_2020: European Union (27 countries), EU28: European Union (28 countries), G20: Group of Twenty, NONOECD: Non-OECD economies and aggregates, OECD: OECD member countries, ROW: Rest of the World, TUR: Türkiye, ZASI: East and Southeastern Asia, ZEUR: Europe, ZNAM: North America, ZOTH: Other regions, ZSCA: South and Central America
ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

scope_1_2 <- scope_1_2 |>
   mutate(Country = case_when(Country == "T¸rkiye" ~ "Turkey",
                             TRUE ~ Country)) |>
  mutate(iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), 
                         Country, 
                         country_regex_to_iso3c(Country)),
         country_name = ifelse(is.na(iso3c_to_country_name(iso3c)), 
                               Country, 
                               iso3c_to_country_name(iso3c)))

Warning: There were 4 warnings in `mutate()`.
The first warning was:
ℹ In argument: `iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), Country,
  country_regex_to_iso3c(Country))`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC: Asia-Pacific Economic Cooperation, ASEAN: Association of South East Asian Nations, Data extracted on 09 Mar 2023 00:49 UTC (GMT) from OECD.Stat, EA19: Euro area (19 countries), EASIA: Eastern Asia, EU13: EU28 excluding EU15, EU15: European Union (15 countries), EU27_2020: European Union (27 countries), EU28: European Union (28 countries), G20: Group of Twenty, NONOECD: Non-OECD economies and aggregates, OECD: OECD member countries, ROW: Rest of the World, TUR: Türkiye, ZASI: East and Southeastern Asia, ZEUR: Europe, ZNAM: North America, ZOTH: Other regions, ZSCA: South and Central America
ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

scope_1_1 <- scope_1_1 |>
   mutate(Country = case_when(Country == "T¸rkiye" ~ "Turkey",
                              Country == "France and Monaco" ~ "France",
                              Country == "Israel and Palestine, State of" ~ "Israel",
                              Country == "Italy, San Marino and the Holy See" ~ "Italy",
                              Country == "Serbia and Montenegro" ~ "Serbia",
                              Country == "Spain and Andorra" ~ "Spain",
                              Country == "Switzerland and Liechtenstein" ~ "Switzerland",
                             TRUE ~ Country)) |>
  mutate(iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), 
                         Country, 
                         country_regex_to_iso3c(Country)),
         country_name = ifelse(is.na(iso3c_to_country_name(iso3c)), 
                               Country, 
                               iso3c_to_country_name(iso3c)))

Warning: There were 4 warnings in `mutate()`.
The first warning was:
ℹ In argument: `iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), Country,
  country_regex_to_iso3c(Country))`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: International Aviation, International Shipping
ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

#rename the World Bank variables to "Country" instead of "Country Name" 


ppp_gdp <-ppp_gdp |>
  mutate(Country = case_when(Country == "Turkiye" ~ "Turkey",
                             Country == "OECD members" ~ "  
OECD: OECD member countries",
                             Country == "Euro area" ~ "EA19: Euro area (19 countries)",
                             TRUE ~ Country)) |>
  mutate(iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), 
                         Country, 
                         country_regex_to_iso3c(Country)),
         country_name = ifelse(is.na(iso3c_to_country_name(iso3c)), 
                               Country, 
                               iso3c_to_country_name(iso3c)))

Warning: There were 4 warnings in `mutate()`.
The first warning was:
ℹ In argument: `iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), Country,
  country_regex_to_iso3c(Country))`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously:   
OECD: OECD member countries, Africa Eastern and Southern, Africa Western and Central, Arab World, Caribbean small states, Central Europe and the Baltics, Channel Islands, EA19: Euro area (19 countries), Early-demographic dividend, East Asia & Pacific, East Asia & Pacific (excluding high income), East Asia & Pacific (IDA & IBRD countries), Europe & Central Asia, Europe & Central Asia (excluding high income), Europe & Central Asia (IDA & IBRD countries), European Union, Fragile and conflict affected situations, Heavily indebted poor countries (HIPC), High income, IBRD only, IDA & IBRD total, IDA blend, IDA only, IDA total, Kosovo, Late-demographic dividend, Latin America & Caribbean, Latin America & Caribbean (excluding high income), Latin America & the Caribbean (IDA & IBRD countries), Least developed countries: UN classification, Low & middle income, Low income, Lower middle income, Middle East & North Africa, Middle East & North Africa (excluding high income), Middle East & North Africa (IDA & IBRD countries), Middle income, North America, Not classified, Other small states, Pacific island small states, Post-demographic dividend, Pre-demographic dividend, Small states, South Asia, South Asia (IDA & IBRD), Sub-Saharan Africa, Sub-Saharan Africa (excluding high income), Sub-Saharan Africa (IDA & IBRD countries), Upper middle income, World
ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

population <-population |>
   mutate(Country = case_when(Country == "Turkiye" ~ "Turkey",
                              Country == "OECD members" ~ " 
OECD: OECD member countries",
                               Country == "Euro area" ~ "EA19: Euro area (19 countries)",
                             TRUE ~ Country)) |>
  mutate(iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), 
                         Country, 
                         country_regex_to_iso3c(Country)),
         country_name = ifelse(is.na(iso3c_to_country_name(iso3c)), 
                               Country, 
                               iso3c_to_country_name(iso3c)))

Warning: There were 4 warnings in `mutate()`.
The first warning was:
ℹ In argument: `iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), Country,
  country_regex_to_iso3c(Country))`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously:   
OECD: OECD member countries, Africa Eastern and Southern, Africa Western and Central, Arab World, Caribbean small states, Central Europe and the Baltics, Channel Islands, EA19: Euro area (19 countries), Early-demographic dividend, East Asia & Pacific, East Asia & Pacific (excluding high income), East Asia & Pacific (IDA & IBRD countries), Europe & Central Asia, Europe & Central Asia (excluding high income), Europe & Central Asia (IDA & IBRD countries), European Union, Fragile and conflict affected situations, Heavily indebted poor countries (HIPC), High income, IBRD only, IDA & IBRD total, IDA blend, IDA only, IDA total, Kosovo, Late-demographic dividend, Latin America & Caribbean, Latin America & Caribbean (excluding high income), Latin America & the Caribbean (IDA & IBRD countries), Least developed countries: UN classification, Low & middle income, Low income, Lower middle income, Middle East & North Africa, Middle East & North Africa (excluding high income), Middle East & North Africa (IDA & IBRD countries), Middle income, North America, Not classified, Other small states, Pacific island small states, Post-demographic dividend, Pre-demographic dividend, Small states, South Asia, South Asia (IDA & IBRD), Sub-Saharan Africa, Sub-Saharan Africa (excluding high income), Sub-Saharan Africa (IDA & IBRD countries), Upper middle income, World
ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

nominal_GDP <-nominal_GDP |>
    mutate(Country = case_when(Country == "Turkiye" ~ "Turkey",
                               Country == "OECD members" ~ "    
OECD: OECD member countries",
                                Country == "Euro area" ~ "EA19: Euro area (19 countries)",

                             TRUE ~ Country)) |>
  mutate(iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), 
                         Country, 
                         country_regex_to_iso3c(Country)),
         country_name = ifelse(is.na(iso3c_to_country_name(iso3c)), 
                               Country, 
                               iso3c_to_country_name(iso3c)))

Warning: There were 4 warnings in `mutate()`.
The first warning was:
ℹ In argument: `iso3c = ifelse(is.na(country_regex_to_iso3c(Country)), Country,
  country_regex_to_iso3c(Country))`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously:   
OECD: OECD member countries, Africa Eastern and Southern, Africa Western and Central, Arab World, Caribbean small states, Central Europe and the Baltics, Channel Islands, EA19: Euro area (19 countries), Early-demographic dividend, East Asia & Pacific, East Asia & Pacific (excluding high income), East Asia & Pacific (IDA & IBRD countries), Europe & Central Asia, Europe & Central Asia (excluding high income), Europe & Central Asia (IDA & IBRD countries), European Union, Fragile and conflict affected situations, Heavily indebted poor countries (HIPC), High income, IBRD only, IDA & IBRD total, IDA blend, IDA only, IDA total, Kosovo, Late-demographic dividend, Latin America & Caribbean, Latin America & Caribbean (excluding high income), Latin America & the Caribbean (IDA & IBRD countries), Least developed countries: UN classification, Low & middle income, Low income, Lower middle income, Middle East & North Africa, Middle East & North Africa (excluding high income), Middle East & North Africa (IDA & IBRD countries), Middle income, North America, Not classified, Other small states, Pacific island small states, Post-demographic dividend, Pre-demographic dividend, Small states, South Asia, South Asia (IDA & IBRD), Sub-Saharan Africa, Sub-Saharan Africa (excluding high income), Sub-Saharan Africa (IDA & IBRD countries), Upper middle income, World
ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

tidy the data (pivot longer, coerce all numbers into numeric form, select variables of interest)

#function
character_num_to_numeric <- function(character_num) {
  character_num |>
  # take out the commas
  stringr::str_remove_all(pattern = ",") |>
  # take out any blank spaces before or after the number
  stringr::str_trim() |>
  # coerce to numeric
  as.numeric()
}


nominal_GDP <- nominal_GDP |>
  pivot_longer(cols = matches("\\d{4}"), names_to = "year") |>
  rename(nominal_gdp = value) |>
  mutate(across(.cols = c(year, nominal_gdp), .fns = character_num_to_numeric))|>
  select(country_name, iso3c, year, nominal_gdp) 

population <-population |>
  pivot_longer(cols = matches("\\d{4}"), names_to = "year") |>
  rename(population = value) |>
  mutate(across(.cols = c(year, population), .fns = character_num_to_numeric)) |>
  select(country_name, iso3c, year, population) 

ppp_gdp <- ppp_gdp |>
  pivot_longer(cols = matches("\\d{4}"), names_to = "year") |>
  rename(ppp_gdp = value) |>
  mutate(across(.cols = c(year, ppp_gdp), .fns = character_num_to_numeric)) |>
  select(country_name, iso3c, year, ppp_gdp)

scope_1_1 <- scope_1_1 |>
  pivot_longer(cols = matches("\\d{4}"), names_to = "year") |>
  rename(domestic_emissions = value) |>
  mutate(across(.cols = c(year, domestic_emissions), .fns = character_num_to_numeric))|>
  select(country_name, iso3c, year, domestic_emissions)

scope_1_2 <- scope_1_2 |>
  pivot_longer(cols = matches("\\d{4}"), names_to = "year") |>
  rename(domestic_export_emissions = value) |>
  mutate(across(.cols = c(year, domestic_export_emissions), .fns = character_num_to_numeric))|>
  select(country_name, iso3c, year, domestic_export_emissions) 

scope_2 <- scope_2 |>
  pivot_longer(cols = matches("\\d{4}"), names_to = "year") |>
  rename(imported_grid_emissions = value) |>
  mutate(across(.cols = c(year, imported_grid_emissions), .fns = character_num_to_numeric))|>
  select(country_name, iso3c, year, imported_grid_emissions)

scope_3 <- scope_3 |>
  pivot_longer(cols = matches("\\d{4}"), names_to = "year") |>
  rename(imported_total_emissions = value) |>
  mutate(across(.cols = c(year, imported_total_emissions), .fns = character_num_to_numeric))|>
  select(country_name, iso3c, year, imported_total_emissions)

write to csv to export tidy data

# First dataset
write.csv(nominal_GDP, file = "nominal_GDP", row.names = FALSE)

# Second dataset
write.csv(population, file = "population", row.names = FALSE)

# Third dataset
write.csv(ppp_gdp, file = "ppp_gdp", row.names = FALSE)

# Fourth dataset
write.csv(scope_1_1, file = "scope_1_1", row.names = FALSE)

# Fifth dataset
write.csv(scope_1_2, file = "scope_1_2", row.names = FALSE)

# Sixth dataset
write.csv(scope_2, file = "scope_2", row.names = FALSE)

# Seventh dataset
write.csv(scope_3, file = "scope_3", row.names = FALSE)