Assignment 2 - Data cleaning and Exploration

Author

Shuhan

Load all the necessary packages

Show the code

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Show the code

library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

Show the code

library(here)

here() starts at /Users/helenayue/sais_sustainable_finance

Show the code

library(readxl)
library(tidyr)
library(dplyr)

library(rnaturalearth)
library(countrycode)
library(wbstats)
library(ggplot2)
library(gganimate)
#install.packages('gganimate')
library(gapminder)
#install.packages('gifski')
library(gifski)

Read the Scope 1 emission data

Show the code

raw_domestic_scope_1 = read_excel("/Users/helenayue/Desktop/Sustainable Finance/historical_emissions.xlsx", col_names = TRUE)

raw_domestic_scope_1

# A tibble: 193 × 31
   Country  Unit  `2018` `2017` `2016` `2015` `2014` `2013` `2012` `2011` `2010`
   <chr>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 China    MtCO…  9853.  9446.  9225.  9151.  9296.  9274.  8832.  8572.  8143.
 2 United … MtCO…  4744.  4588.  4663.  4571.  4688.  4673.  4537.  4755.  4990.
 3 India    MtCO…  2418.  2286.  2161.  2093.  2082.  1908.  1845.  1692.  1665.
 4 Indones… MtCO…  1263.  1066.  1056.  1572.  1533.  1244.  1299.  1281.   772.
 5 Russia   MtCO…  1102.  1036.  1012.   905.   924.   945.   988.  1011.   849.
 6 Japan    MtCO…  1084.  1123.  1136.  1130.  1167.  1213.  1204.  1164.  1058.
 7 Brazil   MtCO…   818.   841.   831.   747.   774.   749.   716.   680.  1499.
 8 Germany  MtCO…   678.   703.   718.   713.   707.   747.   731.   717.   755.
 9 Iran     MtCO…   637.   626.   607.   599.   605.   584.   559.   552.   497.
10 Canada   MtCO…   615.   603.   592.   694.   697.   691.   681.   684.   838.
# … with 183 more rows, and 20 more variables: `2009` <dbl>, `2008` <dbl>,
#   `2007` <dbl>, `2006` <dbl>, `2005` <dbl>, `2004` <dbl>, `2003` <dbl>,
#   `2002` <dbl>, `2001` <dbl>, `2000` <dbl>, `1999` <dbl>, `1998` <dbl>,
#   `1997` <dbl>, `1996` <dbl>, `1995` <dbl>, `1994` <dbl>, `1993` <dbl>,
#   `1992` <dbl>, `1991` <dbl>, `1990` <chr>

Omit data entries

Show the code

raw_domestic_scope_1 <- raw_domestic_scope_1 |>
  na.omit()

Change the emissions from catgorical to numerical for better data dorting later.

Show the code

raw_domestic_scope_1$"1990" <- as.numeric(as.character(raw_domestic_scope_1$"1990"))  # Convert one variable to numeric

Warning: NAs introduced by coercion

Make the dataset longer

Show the code

raw_domestic_scope_1 <- raw_domestic_scope_1 %>% 
  pivot_longer("1990":"2018",
    names_to = "year",
    values_to = "scope_1_emissions"
  )

Rename the data for standardization

Show the code

raw_domestic_scope_1 <- raw_domestic_scope_1 |>
  rename("country_name" = "Country")

Create country code index function and iso3c index function

Show the code

country_name_regex_to_iso3c <- function(country_name) {
  country_name |>
    countrycode(origin = "country.name", 
                                     destination = "iso3c",
                                     origin_regex = TRUE)
}

iso3c_to_country_name <- function(iso3c) {
  iso3c |>
  countrycode(origin = "iso3c", destination = "country.name")
}

Add a standardized country name and ISO code for each country.

Show the code

raw_domestic_scope_1 <- raw_domestic_scope_1 %>% 
  mutate(iso3c = country_name_regex_to_iso3c(country_name)) |>
  mutate(country_name = iso3c_to_country_name(iso3c))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso3c = country_name_regex_to_iso3c(country_name)`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: Micronesia

Download data for export emissions for each country from 1995-2018

Show the code

scope_1_export = read_excel("/Users/helenayue/Desktop/Sustainable Finance/oecd_export.xlsx", col_names = TRUE)

Tidy the data (to make the dataset longer)

Show the code

scope_1_export <- scope_1_export |> 
  pivot_longer("1995":"2018",
    names_to = "year",
    values_to = "export_emissions"
  )

Standardize country names

Show the code

scope_1_export <- scope_1_export |>
  rename("country_name" = "Country")

scope_1_export <- scope_1_export |>
  mutate(iso3c = country_name_regex_to_iso3c(country_name)) |>
  mutate(country_name = iso3c_to_country_name(iso3c))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso3c = country_name_regex_to_iso3c(country_name)`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: Rest of the World

Show the code

scope_1_export <- scope_1_export|>
  na.omit()

Combine domestic emissions and export emissions to generate the complete scope 1 emissions data.

Show the code

scope_1_complete <- 
  full_join(raw_domestic_scope_1, scope_1_export, by = c("year" = "year", "country_name" = "country_name", "iso3c" = "iso3c"))

Download data for scope 2 emissions for each country from 1995-2018

Show the code

scope_2 = read_excel("/Users/helenayue/Desktop/Sustainable Finance/scope_2.xlsx", col_names = TRUE)

New names:
• `` -> `...1`

Tidy the data

Show the code

scope_2 <- scope_2 |> 
  pivot_longer("1995":"2018",
    names_to = "year",
    values_to = "scope_2_emissions"
  )

Standardize the country name

Show the code

scope_2 <- scope_2 |> 
  rename("country_name" = "...1")

scope_2 <- scope_2 |>
  mutate(iso3c = country_name_regex_to_iso3c(country_name)) |>
  mutate(country_name = iso3c_to_country_name(iso3c))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso3c = country_name_regex_to_iso3c(country_name)`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: Türkiye

Omit missing data

Show the code

scope_2 <- scope_2 |>
  na.omit()

Read data for scope 3 emissions for each country from 1995-2018 (here I did the manual calculation for scope 3 emissions in excel).

Show the code

scope_3 = read_excel("/Users/helenayue/Desktop/Sustainable Finance/scope_3.xlsx", col_names = TRUE)

Tidy the data

Show the code

scope_3 <- scope_3 |> 
  pivot_longer("1995":"2018",
    names_to = "year",
    values_to = "scope_3_emissions"
  )

Standardize the country name

Show the code

scope_3 <- scope_3 |> 
  mutate(iso3c = country_name_regex_to_iso3c(country_name)) |>
  mutate(country_name = iso3c_to_country_name(iso3c))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso3c = country_name_regex_to_iso3c(country_name)`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: Türkiye

Omit missing data

Show the code

scope_3 <- scope_3 |>
  na.omit()

Combine scope 1 and scope 2 data

Show the code

combine_scopes_1_2 <- 
  full_join(scope_1_complete, scope_2,  by = c("year" = "year", "country_name" = "country_name", "iso3c" = "iso3c"))

Combine scope 1, 2 and 3 data

Show the code

complete_all_scopes_data <-
    full_join(combine_scopes_1_2, scope_3,  by = c("year" = "year", "country_name" = "country_name", "iso3c" = "iso3c"))

Clean dataset to remove those with years without all kinds of emissions

Show the code

complete_ready_scopes_data <- complete_all_scopes_data %>% 
  na.omit

complete_ready_scopes_data

# A tibble: 1,512 × 8
   country_name Unit   year  scope_1_emissions iso3c export_em…¹ scope…² scope…³
   <chr>        <chr>  <chr>             <dbl> <chr>       <dbl>   <dbl>   <dbl>
 1 China        MtCO₂e 1995              2739. CHN          442.    1.10    116.
 2 China        MtCO₂e 1996              2721. CHN          459.    1.03    123.
 3 China        MtCO₂e 1997              2785. CHN          500.    1.14    136.
 4 China        MtCO₂e 1998              2887. CHN          541.    1.09    141.
 5 China        MtCO₂e 1999              2804. CHN          509.    1.05    165.
 6 China        MtCO₂e 2000              2997. CHN          617.    1.39    216.
 7 China        MtCO₂e 2001              3197. CHN          612.    1.45    236.
 8 China        MtCO₂e 2002              3478. CHN          700.    3.56    266.
 9 China        MtCO₂e 2003              4084. CHN          986.    4.08    339.
10 China        MtCO₂e 2004              4793. CHN         1342.    3.87    404.
# … with 1,502 more rows, and abbreviated variable names ¹export_emissions,
#   ²scope_2_emissions, ³scope_3_emissions

Generate a column to calculate the sum of scope 1, 2, and 3 emissions

Show the code

complete_ready_scopes_data <- complete_ready_scopes_data %>% 
  mutate("consumption_emissions" = scope_1_emissions + scope_2_emissions + scope_3_emissions - export_emissions)

Read GDP PPP data set

Show the code

ppp_gdp_data = read_excel("/Users/helenayue/Desktop/Sustainable Finance/ppp_gdp.xlsx", col_names = TRUE)

Tidy the data set

Show the code

ppp_gdp_data <- ppp_gdp_data %>% 
  pivot_longer("1995":"2018",
    names_to = "year",
    values_to = "PPP_GDP_dollars"
  )

Omit missing data

Show the code

ppp_gdp_data <- ppp_gdp_data %>% 
  na.omit()

Standardize column names

Show the code

ppp_gdp_data <- ppp_gdp_data %>% 
  rename("country_name" = "Country Name")

ppp_gdp_data <- ppp_gdp_data |>
  mutate(iso3c = country_name_regex_to_iso3c(country_name)) |>
  mutate(country_name = iso3c_to_country_name(iso3c))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso3c = country_name_regex_to_iso3c(country_name)`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: Africa Eastern and Southern, Africa Western and Central, Arab World, Caribbean small states, Central Europe and the Baltics, Channel Islands, Early-demographic dividend, East Asia & Pacific, East Asia & Pacific (excluding high income), East Asia & Pacific (IDA & IBRD countries), Euro area, Europe & Central Asia, Europe & Central Asia (excluding high income), Europe & Central Asia (IDA & IBRD countries), European Union, Fragile and conflict affected situations, Heavily indebted poor countries (HIPC), High income, IBRD only, IDA & IBRD total, IDA blend, IDA only, IDA total, Kosovo, Late-demographic dividend, Latin America & Caribbean, Latin America & Caribbean (excluding high income), Latin America & the Caribbean (IDA & IBRD countries), Least developed countries: UN classification, Low & middle income, Low income, Lower middle income, Middle East & North Africa, Middle East & North Africa (excluding high income), Middle East & North Africa (IDA & IBRD countries), Middle income, North America, Not classified, OECD members, Other small states, Pacific island small states, Post-demographic dividend, Pre-demographic dividend, Small states, South Asia, South Asia (IDA & IBRD), Sub-Saharan Africa, Sub-Saharan Africa (excluding high income), Sub-Saharan Africa (IDA & IBRD countries), Turkiye, Upper middle income, World

Add PPP-GDP data to data set with scope 1-3 emissions

Show the code

scopes_GDP_PPP <-
   full_join(complete_all_scopes_data, ppp_gdp_data, by = c("year" = "year", "country_name" = "country_name", "iso3c" = "iso3c"))

Warning in full_join(complete_all_scopes_data, ppp_gdp_data, by = c(year = "year", : Each row in `x` is expected to match at most 1 row in `y`.
ℹ Row 5313 of `x` matches multiple rows.
ℹ If multiple matches are expected, set `multiple = "all"` to silence this
  warning.

Read nominal GDP data set

Show the code

nominal_data = read_excel("/Users/helenayue/Desktop/Sustainable Finance/nominal_gdp.xlsx", col_names = TRUE)

Tidy the data set

Show the code

nominal_data <- nominal_data %>% 
  pivot_longer("1995":"2018",
    names_to = "year",
    values_to = "nom_GDP_dollars"
  )

Omit missing data

Show the code

nominal_data <- nominal_data %>% 
  na.omit()

Standardize column names

Show the code

nominal_data <- nominal_data %>% 
  rename("country_name" = "Country Name")

nominal_data <- nominal_data %>% 
  rename("iso3c" = "Country Code")

ppp_gdp_data <- ppp_gdp_data |>
  mutate(iso3c = country_name_regex_to_iso3c(country_name)) |>
  mutate(country_name = iso3c_to_country_name(iso3c))

Add PPP-GDP data to data set with scope 1-3 emissions

Show the code

scopes_GDP_PPP_nominal <-
   full_join(scopes_GDP_PPP, nominal_data, by = c("year" = "year", "country_name" = "country_name", "iso3c" = "iso3c"))

Clean the data set

Show the code

clean_data <- scopes_GDP_PPP_nominal %>% 
  na.omit()

Here I have all the necessary information and a clean dataset:)