HW- find data and tidying

HW: Find Your Data and Tidying

library(readxl)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(writexl)
library(dplyr)
library(countrycode)
library(here) # for using relative file paths

here() starts at /Users/ciel.wang/Desktop/JH

read the excel files:

setwd("/Users/ciel.wang/Desktop/JH/R/data-raw")

path_to_sheet <- here("data-raw", "sovereign_debt_metric.xlsx")

sovereign_ems <- read_xlsx("sovereign_debt_metric.xlsx", 
                               sheet = 'emission_by_country'  )
ppp_gdp <- read_xlsx("sovereign_debt_metric.xlsx", 
                     sheet = 'PPP_GDP'  )
debt <- read_xlsx("sovereign_debt_metric.xlsx", 
                              sheet = 'outstanding_debt'  )

design function to standardize country name and code:

country_regex_to_iso3c <- function(country_string) {
  country_string |>
    countrycode::countrycode(origin = "country.name", destination = "iso3c", origin_regex = TRUE)
}

iso3c_to_country_name <- function(country_string) {
  country_string |>
    countrycode::countrycode(origin = "iso3c", destination = "country.name")
}

###pivot longer each data set and select variables needed
sovereign_emssion_long <- sovereign_ems |> 
  pivot_longer(
    cols= starts_with('Y_'),
    names_to = 'year',
    values_to = 'sovereign_emission'
  ) |>
  mutate(year=parse_number(year)) |>
    select('Country_code_A3','Name', 'year','sovereign_emission')
sovereign_emssion_long

# A tibble: 11,819 × 4
   Country_code_A3 Name   year sovereign_emission
   <chr>           <chr> <dbl>              <dbl>
 1 ABW             Aruba  1970               45.2
 2 ABW             Aruba  1971               50.1
 3 ABW             Aruba  1972               60.9
 4 ABW             Aruba  1973               65.8
 5 ABW             Aruba  1974               65.1
 6 ABW             Aruba  1975               79.1
 7 ABW             Aruba  1976               78.2
 8 ABW             Aruba  1977               88.9
 9 ABW             Aruba  1978               93.7
10 ABW             Aruba  1979               97.7
# ℹ 11,809 more rows

PPP_GDP_long <- ppp_gdp |> 
  pivot_longer(
    cols= !c("Country Name", "Country Code", "Indicator Name", "Indicator Code"),
    names_to = 'year',
    values_to = 'PPP_GDP',
    values_drop_na = TRUE
  ) |>
 rename(Name = "Country Name") |>
  mutate(year = as.double(year)) |>
  select("Country Code",Name, year, PPP_GDP)
PPP_GDP_long

# A tibble: 7,730 × 4
   `Country Code` Name   year     PPP_GDP
   <chr>          <chr> <dbl>       <dbl>
 1 ABW            Aruba  1990 1363676996.
 2 ABW            Aruba  1991 1522053333.
 3 ABW            Aruba  1992 1648312495.
 4 ABW            Aruba  1993 1810691879.
 5 ABW            Aruba  1994 2001077313.
 6 ABW            Aruba  1995 2095077029.
 7 ABW            Aruba  1996 2158735270.
 8 ABW            Aruba  1997 2350707816.
 9 ABW            Aruba  1998 2424518584.
10 ABW            Aruba  1999 2489125252.
# ℹ 7,720 more rows

outstanding_debt_long <- debt |>
  mutate(across(starts_with(c("19", "20")), as.character)) |>
  pivot_longer(
    cols = !c("Country Name", 'Country Code', 'Counterpart-Area Name',
              'Counterpart-Area Code', 'Series Name', 'Series Code'),
    names_to = 'year',
    values_to = 'outstanding_debt',
    values_transform = list(year = as.character)
  ) |>
  rename(Name = "Country Name") |>
  mutate(outstanding_debt = na_if(outstanding_debt, "..")) |>
  filter(!is.na(outstanding_debt)) |>
  mutate(year = substr(year, 1, 4),
         year = as.double(year),
         outstanding_debt = as.double(outstanding_debt)) |>
  select("Country Code", Name, year, 'outstanding_debt')  
outstanding_debt_long

# A tibble: 6,325 × 4
   `Country Code` Name         year outstanding_debt
   <chr>          <chr>       <dbl>            <dbl>
 1 AFG            Afghanistan  2006       919625563.
 2 AFG            Afghanistan  2007      1902213499.
 3 AFG            Afghanistan  2008      1994109003 
 4 AFG            Afghanistan  2009      2105711788.
 5 AFG            Afghanistan  2010      1975547214.
 6 AFG            Afghanistan  2011      2032464250.
 7 AFG            Afghanistan  2012      2079786013.
 8 AFG            Afghanistan  2013      2106231492.
 9 AFG            Afghanistan  2014      2020951889.
10 AFG            Afghanistan  2015      1989954793.
# ℹ 6,315 more rows

Try to standardize country variables :

#### standardzied country names and codes
#sovereign_emssion_long |>
#  mutate(iso3c = country_regex_to_iso3c(Name))
# Warning: the following five country names cannot be parsed through the countrycode package,and these are not found in the next two data sets. 
'Int. Aviation' |> country_regex_to_iso3c()

Warning: Some values were not matched unambiguously: Int. Aviation

[1] NA

'Int. Shipping' |> country_regex_to_iso3c()

Warning: Some values were not matched unambiguously: Int. Shipping

[1] NA

'Netherlands Antilles' |> country_regex_to_iso3c()

Warning: Some values were not matched unambiguously: Netherlands Antilles

[1] NA

'Serbia and Montenegro' |> country_regex_to_iso3c()

Warning: Some values were not matched unambiguously: Serbia and Montenegro

[1] NA

'Virgin Islands_USA ' |> country_regex_to_iso3c()

Warning: Some values were not matched unambiguously: Virgin Islands_USA

[1] NA

#PPP_GDP_long |>
#  mutate(iso3c = country_regex_to_iso3c(Name))
#########Unfortunately a lot of the variables cannot be parsed, as they are region names:      Africa Eastern and Southern, Africa Western and Central, Arab World, Caribbean small states, Central Europe and the Baltics, Early-demographic dividend, East Asia & Pacific, East Asia & Pacific (excluding high income), East Asia & Pacific (IDA & IBRD countries), Euro area, Europe & Central Asia, Europe & Central Asia (excluding high income), Europe & Central Asia (IDA & IBRD countries), European Union, Fragile and conflict affected situations, Heavily indebted poor countries (HIPC), High income, IBRD only, IDA & IBRD total, IDA blend, IDA only, IDA total, Kosovo, Late-demographic dividend, Latin America & Caribbean, Latin America & Caribbean (excluding high income), Latin America & the Caribbean (IDA & IBRD countries), Least developed countries: UN classification, Low & middle income, Low income, Lower middle income, Middle East & North Africa, Middle East & North Africa (excluding high income), Middle East & North Africa (IDA & IBRD countries), Middle income, North America, OECD members, Other small states, Pacific island small states, Post-demographic dividend, Pre-demographic dividend, Small states, South Asia, South Asia (IDA & IBRD), Sub-Saharan Africa, Sub-Saharan Africa (excluding high income), Sub-Saharan Africa (IDA & IBRD countries), Upper middle income, World 


#outstanding_debt_long |>
#  mutate(iso3c = country_regex_to_iso3c(Name))
#Same as before, there are a lof of unrexignized regional variables:East Asia & Pacific (excluding high income), Europe & Central Asia (excluding high income), IDA only, IDA total, Kosovo, Latin America & Caribbean (excluding high income), Least developed countries: UN classification, Low & middle income, Low income, Lower middle income, Middle East & North Africa (excluding high income), Middleincome, South Asia, Sub-Saharan Africa (excluding high income), Upper middle income

'Kosovo' |> country_regex_to_iso3c()

Warning: Some values were not matched unambiguously: Kosovo

[1] NA

#This variable is the only unmatched country variable from the last two data set, but it is not included in the sovereign_emission_long data set.

#Therefore, we have to drop these variables then standardize


sovereign_emission_long2 <- sovereign_emssion_long |>
  filter(Name != "Int. Aviation" & 
           Name != "Int. Shipping" & 
           Name != "Netherlands Antilles" & 
           Name != "Serbia and Montenegro" & 
           Name != "Virgin Islands_USA") |>
  mutate(iso3c = country_regex_to_iso3c(Name),
         country_name = iso3c_to_country_name(iso3c)) |>
  select("iso3c", "country_name", "year", "sovereign_emission")

PPP_GDP_long2 <- PPP_GDP_long |>
  filter(
    Name != "Africa Eastern and Southern",
    Name != "Africa Western and Central",
    Name != "Arab World",
    Name != "Caribbean small states",
    Name != "Central Europe and the Baltics",
    Name != "Early-demographic dividend",
    Name != "East Asia & Pacific",
    Name != "East Asia & Pacific (excluding high income)",
    Name != "East Asia & Pacific (IDA & IBRD countries)",
    Name != "Euro area",
    Name != "Europe & Central Asia",
    Name != "Europe & Central Asia (excluding high income)",
    Name != "Europe & Central Asia (IDA & IBRD countries)",
    Name != "European Union",
    Name != "Fragile and conflict affected situations",
    Name != "Heavily indebted poor countries (HIPC)",
    Name != "High income",
    Name != "IBRD only",
    Name != "IDA & IBRD total",
    Name != "IDA blend",
    Name != "IDA only",
    Name != "IDA total",
    Name != "Kosovo",
    Name != "Late-demographic dividend",
    Name != "Latin America & Caribbean",
    Name != "Latin America & Caribbean (excluding high income)",
    Name != "Latin America & the Caribbean (IDA & IBRD countries)",
    Name != "Least developed countries: UN classification",
    Name != "Low & middle income",
    Name != "Low income",
    Name != "Lower middle income",
    Name != "Middle East & North Africa",
    Name != "Middle East & North Africa (excluding high income)",
    Name != "Middle East & North Africa (IDA & IBRD countries)",
    Name != "Middle income",
    Name != "North America",
    Name != "OECD members",
    Name != "Other small states",
    Name != "Pacific island small states",
    Name != "Post-demographic dividend",
    Name != "Pre-demographic dividend",
    Name != "Small states",
    Name != "South Asia",
    Name != "South Asia (IDA & IBRD)",
    Name != "Sub-Saharan Africa",
    Name != "Sub-Saharan Africa (excluding high income)",
    Name != "Sub-Saharan Africa (IDA & IBRD countries)",
    Name != "Upper middle income",
    Name != "World"
  )|>
  mutate(iso3c = country_regex_to_iso3c(Name),
         country_name = iso3c_to_country_name(iso3c))|>
  select("iso3c", "country_name", "year", "PPP_GDP")

outstanding_debt_long2 <- outstanding_debt_long |>
  filter(
    Name != "East Asia & Pacific (excluding high income)",
    Name != "Europe & Central Asia (excluding high income)",
    Name != "IDA only",
    Name != "IDA total",
    Name != "Kosovo",
    Name != "Latin America & Caribbean (excluding high income)",
    Name != "Least developed countries: UN classification",
    Name != "Low & middle income",
    Name != "Low income",
    Name != "Lower middle income",
    Name != "Middle East & North Africa (excluding high income)",
    Name != "Middle income",
    Name != "South Asia",
    Name != "Sub-Saharan Africa (excluding high income)",
    Name != "Upper middle income"
  ) |>
  mutate(iso3c = country_regex_to_iso3c(Name),
         country_name = iso3c_to_country_name(iso3c))|>
  select("iso3c", "country_name", "year", "outstanding_debt")



sovereign_emission_long2

# A tibble: 11,554 × 4
   iso3c country_name  year sovereign_emission
   <chr> <chr>        <dbl>              <dbl>
 1 ABW   Aruba         1970               45.2
 2 ABW   Aruba         1971               50.1
 3 ABW   Aruba         1972               60.9
 4 ABW   Aruba         1973               65.8
 5 ABW   Aruba         1974               65.1
 6 ABW   Aruba         1975               79.1
 7 ABW   Aruba         1976               78.2
 8 ABW   Aruba         1977               88.9
 9 ABW   Aruba         1978               93.7
10 ABW   Aruba         1979               97.7
# ℹ 11,544 more rows

PPP_GDP_long2

# A tibble: 6,151 × 4
   iso3c country_name  year     PPP_GDP
   <chr> <chr>        <dbl>       <dbl>
 1 ABW   Aruba         1990 1363676996.
 2 ABW   Aruba         1991 1522053333.
 3 ABW   Aruba         1992 1648312495.
 4 ABW   Aruba         1993 1810691879.
 5 ABW   Aruba         1994 2001077313.
 6 ABW   Aruba         1995 2095077029.
 7 ABW   Aruba         1996 2158735270.
 8 ABW   Aruba         1997 2350707816.
 9 ABW   Aruba         1998 2424518584.
10 ABW   Aruba         1999 2489125252.
# ℹ 6,141 more rows

outstanding_debt_long2

# A tibble: 5,569 × 4
   iso3c country_name  year outstanding_debt
   <chr> <chr>        <dbl>            <dbl>
 1 AFG   Afghanistan   2006       919625563.
 2 AFG   Afghanistan   2007      1902213499.
 3 AFG   Afghanistan   2008      1994109003 
 4 AFG   Afghanistan   2009      2105711788.
 5 AFG   Afghanistan   2010      1975547214.
 6 AFG   Afghanistan   2011      2032464250.
 7 AFG   Afghanistan   2012      2079786013.
 8 AFG   Afghanistan   2013      2106231492.
 9 AFG   Afghanistan   2014      2020951889.
10 AFG   Afghanistan   2015      1989954793.
# ℹ 5,559 more rows

Join the data frames by country and year and calculate the financed emission based on the sovereign debt metric:

sovereign_debt_emission <- left_join(sovereign_emission_long2, PPP_GDP_long2, 
                                     by = c("iso3c" = "iso3c", "year" = "year", "country_name" = "country_name")) |>
  left_join(outstanding_debt_long2, 
            by = c("iso3c" = "iso3c", "year" = "year", "country_name" = "country_name")) |>
  mutate(
    sovereign_emission = as.numeric(sovereign_emission),
    outstanding_debt = as.numeric(outstanding_debt),
    PPP_GDP = as.numeric(PPP_GDP),
    financed_emission = sovereign_emission * outstanding_debt / PPP_GDP
  ) |>
  select("iso3c", "country_name", "year", "sovereign_emission", "outstanding_debt",
         "PPP_GDP", "financed_emission") 

sovereign_debt_emission

# A tibble: 11,554 × 7
   iso3c country_name  year sovereign_emission outstanding_debt PPP_GDP
   <chr> <chr>        <dbl>              <dbl>            <dbl>   <dbl>
 1 ABW   Aruba         1970               45.2               NA      NA
 2 ABW   Aruba         1971               50.1               NA      NA
 3 ABW   Aruba         1972               60.9               NA      NA
 4 ABW   Aruba         1973               65.8               NA      NA
 5 ABW   Aruba         1974               65.1               NA      NA
 6 ABW   Aruba         1975               79.1               NA      NA
 7 ABW   Aruba         1976               78.2               NA      NA
 8 ABW   Aruba         1977               88.9               NA      NA
 9 ABW   Aruba         1978               93.7               NA      NA
10 ABW   Aruba         1979               97.7               NA      NA
# ℹ 11,544 more rows
# ℹ 1 more variable: financed_emission <dbl>

Note that emission for some of the years are NA due to missing data from different data sources, we thus drop the missing ones and give meaning labels to the data set:

sovereign_debt_dropped <- na.omit(sovereign_debt_emission) |>
  rename(
    Country_Code = iso3c,
    Country_Name = country_name,
    Year = year,
    Sovereign_Emission = sovereign_emission,
    Outstanding_Debt = outstanding_debt,
    PPP_GDP = PPP_GDP,
    Financed_Emission = financed_emission
  )

sovereign_debt_dropped

# A tibble: 3,649 × 7
   Country_Code Country_Name  Year Sovereign_Emission Outstanding_Debt   PPP_GDP
   <chr>        <chr>        <dbl>              <dbl>            <dbl>     <dbl>
 1 AFG          Afghanistan   2006             18461.       919625563.   2.85e10
 2 AFG          Afghanistan   2007             19618.      1902213499.   3.33e10
 3 AFG          Afghanistan   2008             23090.      1994109003    3.53e10
 4 AFG          Afghanistan   2009             26313.      2105711788.   4.31e10
 5 AFG          Afghanistan   2010             30191.      1975547214.   4.99e10
 6 AFG          Afghanistan   2011             34470.      2032464250.   5.12e10
 7 AFG          Afghanistan   2012             32465.      2079786013.   5.97e10
 8 AFG          Afghanistan   2013             30900.      2106231492.   6.50e10
 9 AFG          Afghanistan   2014             31214.      2020951889.   6.91e10
10 AFG          Afghanistan   2015             31208.      1989954793.   7.18e10
# ℹ 3,639 more rows
# ℹ 1 more variable: Financed_Emission <dbl>

Finally, save the processed data:

write.csv(sovereign_debt_dropped, file = "sovereign_debt_dropped.csv", row.names = FALSE)

The financed emission of sovereign debt is somehow not showing up after rendering, but it should be like this: