OECD Data Clean-Up

Author

Jenny Park

library(tidyverse) 
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.1.8
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(here)
here() starts at /Users/apple/R files
library(readxl)

library(rnaturalearth)
library(countrycode)
library(wbstats)
?countrycode()
view(codelist)
country_name_regex_to_iso3c <- function(country_name) {
  country_name |>
    countrycode(origin = "country.name", 
                                     destination = "iso3c",
                                     origin_regex = TRUE)
}
iso3c_to_country_name <- function(iso3c) {
  iso3c |>
  countrycode(origin = "iso3c", destination = "country.name")
}
iso3c_to_x <- purrr::partial(countrycode, origin = "iso3c")
OECD_emissions_trade_based_raw <- read_excel("OECD_data.xlsx", sheet = "OECD.Stat export", skip = 6)
New names:
• `` -> `...2`
OECD_emissions_trade_based_raw
# A tibble: 86 × 26
   Time    ...2   `1995`  `1996`  `1997`  `1998`  `1999`  `2000`  `2001`  `2002`
   <chr>   <lgl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1 Country NA    NA      NA      NA      NA      NA      NA      NA      NA     
 2 WLD: W… NA     2.14e4  2.18e4  2.22e4  2.24e4  2.25e4  2.32e4  2.36e4  2.39e4
 3 OECD: … NA     1.30e4  1.33e4  1.35e4  1.37e4  1.40e4  1.46e4  1.44e4  1.44e4
 4 AUS: A… NA     2.78e2  2.84e2  2.98e2  3.22e2  3.20e2  3.24e2  3.22e2  3.40e2
 5 AUT: A… NA     8.51e1  8.94e1  8.36e1  8.57e1  8.37e1  8.39e1  8.57e1  8.44e1
 6 BEL: B… NA     1.30e2  1.32e2  1.27e2  1.31e2  1.27e2  1.27e2  1.25e2  1.21e2
 7 CAN: C… NA     4.09e2  4.17e2  4.36e2  4.39e2  4.46e2  4.57e2  4.50e2  4.71e2
 8 CHL: C… NA     3.99e1  4.62e1  5.31e1  5.66e1  5.59e1  5.39e1  5.10e1  5.37e1
 9 COL: C… NA     6.55e1  6.56e1  7.22e1  7.23e1  5.97e1  5.99e1  6.11e1  6.09e1
10 CRI: C… NA     8.05e0  7.77e0  8.17e0  9.12e0  9.01e0  8.71e0  8.84e0  9.31e0
# … with 76 more rows, and 16 more variables: `2003` <dbl>, `2004` <dbl>,
#   `2005` <dbl>, `2006` <dbl>, `2007` <dbl>, `2008` <dbl>, `2009` <dbl>,
#   `2010` <dbl>, `2011` <dbl>, `2012` <dbl>, `2013` <dbl>, `2014` <dbl>,
#   `2015` <dbl>, `2016` <dbl>, `2017` <dbl>, `2018` <dbl>
view(OECD_emissions_trade_based_raw)
#omitting data that has no value 
OECD_emissions_trade_actual <- OECD_emissions_trade_based_raw %>% 
  select(-`...2`) |> 
   filter(Time != "Country")

#renaming column to match the data 
OECD_emissions_trade_actual <- OECD_emissions_trade_actual %>% 
  rename(geography = Time)

view(OECD_emissions_trade_actual)
geography_tda <- OECD_emissions_trade_actual |> 
  select(geography)

geography_tda
# A tibble: 85 × 1
   geography                  
   <chr>                      
 1 WLD: World                 
 2 OECD: OECD member countries
 3 AUS: Australia             
 4 AUT: Austria               
 5 BEL: Belgium               
 6 CAN: Canada                
 7 CHL: Chile                 
 8 COL: Colombia              
 9 CRI: Costa Rica            
10 CZE: Czech Republic        
# … with 75 more rows
#standarizing geography to geography code
geography_tda_2 <- geography_tda |> 
  separate(col = geography, into = c("geography_code", "geography_name"), 
           sep = ":", remove = FALSE) |> 
  mutate(across(c("geography_code", "geography_name"), str_trim))

geography_tda_2
# A tibble: 85 × 3
   geography                   geography_code geography_name       
   <chr>                       <chr>          <chr>                
 1 WLD: World                  WLD            World                
 2 OECD: OECD member countries OECD           OECD member countries
 3 AUS: Australia              AUS            Australia            
 4 AUT: Austria                AUT            Austria              
 5 BEL: Belgium                BEL            Belgium              
 6 CAN: Canada                 CAN            Canada               
 7 CHL: Chile                  CHL            Chile                
 8 COL: Colombia               COL            Colombia             
 9 CRI: Costa Rica             CRI            Costa Rica           
10 CZE: Czech Republic         CZE            Czech Republic       
# … with 75 more rows
#creating country code index function
geography_tda_2 |> 
  mutate(country_name = countrycode(sourcevar = geography_code, 
                                    origin = "iso3c", 
                                    destination = "country.name")) |> 
  filter(is.na(country_name))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `country_name = countrycode(sourcevar = geography_code, origin =
  "iso3c", destination = "country.name")`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC, ASEAN, DATA EXTRACTED ON 05 MAR 2023 17, EA19, EASIA, EU13, EU15, EU27_2020, EU28, G20, NONOECD, OECD, ROW, WLD, ZASI, ZEUR, ZNAM, ZOTH, ZSCA
# A tibble: 19 × 4
   geography                                             geogr…¹ geogr…² count…³
   <chr>                                                 <chr>   <chr>   <chr>  
 1 WLD: World                                            WLD     World   <NA>   
 2 OECD: OECD member countries                           OECD    OECD m… <NA>   
 3 NONOECD: Non-OECD economies and aggregates            NONOECD Non-OE… <NA>   
 4 ROW: Rest of the World                                ROW     Rest o… <NA>   
 5 APEC: Asia-Pacific Economic Cooperation               APEC    Asia-P… <NA>   
 6 ASEAN: Association of South East Asian Nations        ASEAN   Associ… <NA>   
 7 EASIA: Eastern Asia                                   EASIA   Easter… <NA>   
 8 EU27_2020: European Union (27 countries)              EU27_2… Europe… <NA>   
 9 EU28: European Union (28 countries)                   EU28    Europe… <NA>   
10 EU15: European Union (15 countries)                   EU15    Europe… <NA>   
11 EU13: EU28 excluding EU15                             EU13    EU28 e… <NA>   
12 EA19: Euro area (19 countries)                        EA19    Euro a… <NA>   
13 G20: Group of Twenty                                  G20     Group … <NA>   
14 ZEUR: Europe                                          ZEUR    Europe  <NA>   
15 ZASI: East and Southeastern Asia                      ZASI    East a… <NA>   
16 ZNAM: North America                                   ZNAM    North … <NA>   
17 ZSCA: South and Central America                       ZSCA    South … <NA>   
18 ZOTH: Other regions                                   ZOTH    Other … <NA>   
19 Data extracted on 05 Mar 2023 17:31 UTC (GMT) from O… Data e… 31 UTC… <NA>   
# … with abbreviated variable names ¹​geography_code, ²​geography_name,
#   ³​country_name
#standarizing and adding country name and ISO code further 
geography_proc <- geography_tda_2 |> 
  mutate(country_name = countrycode(sourcevar = geography_code, 
                                    origin = "iso3c", 
                                    destination = "country.name")) |>
  mutate(country_or_region = if_else(is.na(country_name), 
                                     true = "region",
                                     false = "country")) |> 
  mutate(country_name = if_else(is.na(country_name),
                                true = geography_name,
                                false = country_name)) |>
  select(geography, country_name, iso3c = geography_code, country_or_region)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `country_name = countrycode(sourcevar = geography_code, origin =
  "iso3c", destination = "country.name")`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC, ASEAN, DATA EXTRACTED ON 05 MAR 2023 17, EA19, EASIA, EU13, EU15, EU27_2020, EU28, G20, NONOECD, OECD, ROW, WLD, ZASI, ZEUR, ZNAM, ZOTH, ZSCA
geography_proc
# A tibble: 85 × 4
   geography                   country_name          iso3c country_or_region
   <chr>                       <chr>                 <chr> <chr>            
 1 WLD: World                  World                 WLD   region           
 2 OECD: OECD member countries OECD member countries OECD  region           
 3 AUS: Australia              Australia             AUS   country          
 4 AUT: Austria                Austria               AUT   country          
 5 BEL: Belgium                Belgium               BEL   country          
 6 CAN: Canada                 Canada                CAN   country          
 7 CHL: Chile                  Chile                 CHL   country          
 8 COL: Colombia               Colombia              COL   country          
 9 CRI: Costa Rica             Costa Rica            CRI   country          
10 CZE: Czech Republic         Czechia               CZE   country          
# … with 75 more rows
step3_ <- geography_proc |> 
  left_join(OECD_emissions_trade_actual, by = "geography") |> 
  select(-geography)

step3_
# A tibble: 85 × 27
   count…¹ iso3c count…² `1995` `1996` `1997` `1998` `1999` `2000` `2001` `2002`
   <chr>   <chr> <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 World   WLD   region  2.14e4 2.18e4 2.22e4 2.24e4 2.25e4 2.32e4 2.36e4 2.39e4
 2 OECD m… OECD  region  1.30e4 1.33e4 1.35e4 1.37e4 1.40e4 1.46e4 1.44e4 1.44e4
 3 Austra… AUS   country 2.78e2 2.84e2 2.98e2 3.22e2 3.20e2 3.24e2 3.22e2 3.40e2
 4 Austria AUT   country 8.51e1 8.94e1 8.36e1 8.57e1 8.37e1 8.39e1 8.57e1 8.44e1
 5 Belgium BEL   country 1.30e2 1.32e2 1.27e2 1.31e2 1.27e2 1.27e2 1.25e2 1.21e2
 6 Canada  CAN   country 4.09e2 4.17e2 4.36e2 4.39e2 4.46e2 4.57e2 4.50e2 4.71e2
 7 Chile   CHL   country 3.99e1 4.62e1 5.31e1 5.66e1 5.59e1 5.39e1 5.10e1 5.37e1
 8 Colomb… COL   country 6.55e1 6.56e1 7.22e1 7.23e1 5.97e1 5.99e1 6.11e1 6.09e1
 9 Costa … CRI   country 8.05e0 7.77e0 8.17e0 9.12e0 9.01e0 8.71e0 8.84e0 9.31e0
10 Czechia CZE   country 1.05e2 1.09e2 1.06e2 9.98e1 9.61e1 1.02e2 1.03e2 1.02e2
# … with 75 more rows, 16 more variables: `2003` <dbl>, `2004` <dbl>,
#   `2005` <dbl>, `2006` <dbl>, `2007` <dbl>, `2008` <dbl>, `2009` <dbl>,
#   `2010` <dbl>, `2011` <dbl>, `2012` <dbl>, `2013` <dbl>, `2014` <dbl>,
#   `2015` <dbl>, `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, and abbreviated
#   variable names ¹​country_name, ²​country_or_region
step4_ <- step3_ |> 
  pivot_longer(cols = matches("\\d{4}"), 
               names_to = "year", 
               names_transform = as.integer)
step4_
# A tibble: 2,040 × 5
   country_name iso3c country_or_region  year  value
   <chr>        <chr> <chr>             <int>  <dbl>
 1 World        WLD   region             1995 21367.
 2 World        WLD   region             1996 21815.
 3 World        WLD   region             1997 22214.
 4 World        WLD   region             1998 22391.
 5 World        WLD   region             1999 22527.
 6 World        WLD   region             2000 23240.
 7 World        WLD   region             2001 23587.
 8 World        WLD   region             2002 23924.
 9 World        WLD   region             2003 24956.
10 World        WLD   region             2004 26130.
# … with 2,030 more rows
step4_ <- step4_ %>% 
  na.omit()

view(step4_)
oecd_metadata_ <- read_excel("OECD_data.xlsx", sheet = "OECD.Stat export",
                            range = "a3:c6", 
  col_names = c("metadata_parameter", "blank", "metadata_value")) |> 
  select(-blank) |> 
  pivot_wider(names_from = metadata_parameter, values_from = metadata_value)

oecd_metadata_
# A tibble: 1 × 4
  Indicator                                                Indus…¹ Partner Unit 
  <chr>                                                    <chr>   <chr>   <chr>
1 FD_CO2: CO2 emissions embodied in domestic final demand… DTOTAL… WLD: W… Tonn…
# … with abbreviated variable name ¹​Industry
oecd_metadata_jenny <- tribble(~my_indicator_name,
                            "CO2 from international trade")

oecd_metadata_jenny
# A tibble: 1 × 1
  my_indicator_name           
  <chr>                       
1 CO2 from international trade
step4_ |> 
  bind_cols(oecd_metadata_) |> 
  clean_names()
# A tibble: 2,016 × 9
   country_name iso3c country_or_re…¹  year  value indic…² indus…³ partner unit 
   <chr>        <chr> <chr>           <int>  <dbl> <chr>   <chr>   <chr>   <chr>
 1 World        WLD   region           1995 21367. FD_CO2… DTOTAL… WLD: W… Tonn…
 2 World        WLD   region           1996 21815. FD_CO2… DTOTAL… WLD: W… Tonn…
 3 World        WLD   region           1997 22214. FD_CO2… DTOTAL… WLD: W… Tonn…
 4 World        WLD   region           1998 22391. FD_CO2… DTOTAL… WLD: W… Tonn…
 5 World        WLD   region           1999 22527. FD_CO2… DTOTAL… WLD: W… Tonn…
 6 World        WLD   region           2000 23240. FD_CO2… DTOTAL… WLD: W… Tonn…
 7 World        WLD   region           2001 23587. FD_CO2… DTOTAL… WLD: W… Tonn…
 8 World        WLD   region           2002 23924. FD_CO2… DTOTAL… WLD: W… Tonn…
 9 World        WLD   region           2003 24956. FD_CO2… DTOTAL… WLD: W… Tonn…
10 World        WLD   region           2004 26130. FD_CO2… DTOTAL… WLD: W… Tonn…
# … with 2,006 more rows, and abbreviated variable names ¹​country_or_region,
#   ²​indicator, ³​industry
GDP_PPP_raw <- read_excel("gdp_by_ppp.xlsx", sheet = "Data", skip = 3)
library(dplyr)

#getting rid of empty data columns
GDP_PPP <- GDP_PPP_raw %>% 
  select(-matches("^196[0-9]$"), -matches("^197[0-9]$"), -matches("^198[0-9]$"))

#country year and GDP PPP are all columns 
GDP_PPP <- GDP_PPP %>% 
  pivot_longer("1990":"2021",
               names_to = "year",
               values_to = "GDP_PPP_dollars")

GDP_PPP <- GDP_PPP %>% 
  na.omit()

GDP_PPP <- GDP_PPP %>% 
  select(-c("Indicator Name", "Indicator Code")) %>%
  rename(country_name = "Country Name", iso3c = "Country Code")

view(GDP_PPP)
# Convert the year column in GDP_PPP to integer bc it said an error when i tried it without 
GDP_PPP <- GDP_PPP %>%
  mutate(year = as.integer(year))

trade_oecd_gdp_ppp  <- full_join(step4_, GDP_PPP, by = c("year" = "year", "country_name" = "country_name", "iso3c" = "iso3c"))

trade_oecd_gdp_ppp <- trade_oecd_gdp_ppp %>% 
  na.omit()

trade_oecd_gdp_ppp <- trade_oecd_gdp_ppp %>% 
  rename("emission_based_on_trade" = "value")
view(trade_oecd_gdp_ppp)
#saving as csv file
write.csv(trade_oecd_gdp_ppp, "trade_oecd_gdp_ppp.csv", row.names = FALSE)