# Data Cleaning

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(tidyquant)

## 载入需要的程辑包：lubridate
## 
## 载入程辑包：'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## 载入需要的程辑包：PerformanceAnalytics
## 载入需要的程辑包：xts
## 载入需要的程辑包：zoo
## 
## 载入程辑包：'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## 载入程辑包：'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## 载入程辑包：'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## 载入需要的程辑包：quantmod
## 载入需要的程辑包：TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(lubridate)


library(plotly)

## 
## 载入程辑包：'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(esquisse)
library(here)

## here() starts at /Users/irisyan/Sustainable finance final project Jing Yan

library(janitor)

## 
## 载入程辑包：'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(ggthemes)
library(ggrepel)
library(gt)

library(countrycode)
library(wbstats)

# Path helper for the raw-data folder: folder_path("x.csv") resolves to
# here("00_data_raw", "x.csv"), and folder_path() to the folder itself.
folder_path <- function(...) {
  here("00_data_raw", ...)
}

# List the files currently present in the raw-data folder.
list.files(folder_path())

## [1] "country_features_2022-10.csv"    "emissions_dataset_full.csv"     
## [3] "emissions_dataset.csv"           "Green_bond_full_dataset.csv"    
## [5] "imf_weo_by_country_2022_oct.csv"

# Read emissions_dataset.csv from the raw-data folder.
emissions_dataset <- read_csv(folder_path("emissions_dataset.csv"))

## Rows: 2820 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): country_name, iso3c, em_dm
## dbl (28): year, gdp_usd_current_prices, gdp_ppp_current_prices, gdp_pc_usd_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read emissions_dataset_full.csv from the raw-data folder.
emissions_dataset_full <- read_csv(folder_path("emissions_dataset_full.csv"))

## Rows: 6702 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): iso3c
## dbl (30): year, gdp_usd_current_prices, gdp_ppp_current_prices, gdp_pc_usd_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read imf_weo_by_country_2022_oct.csv from the raw-data folder.
imf_weo_by_country_2022 <- read_csv(
  folder_path("imf_weo_by_country_2022_oct.csv")
)

## Rows: 414000 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): country_name, iso3c, short_name_unit, short_name, short_unit, categ...
## dbl (2): year, value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read country_features_2022-10.csv from the raw-data folder.
country_features_2022 <- read_csv(folder_path("country_features_2022-10.csv"))

## Rows: 217 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): country_name, iso3c, wb_income_group, wb_region
## dbl (4): debt_gross_percent_of_gdp, nominal_gdp_bn_ppp, nominal_gdp_per_capi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read Green_bond_full_dataset.csv from the raw-data folder.
Green_bond_full_dataset <- read_csv(folder_path("Green_bond_full_dataset.csv"))

## Rows: 5006 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): description, maturity_date, coupon_class, currency, ESG_bond_type,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the help page for countrycode(); interactive reference only, it does
# not touch any of the datasets.
help("countrycode")

# Convert country names to iso3c codes.
#
# country_name: vector of country names; it is handed to countrycode() as the
#   source vector, with origin "country.name", destination "iso3c" and
#   origin_regex = TRUE.
# Returns the value produced by countrycode() for those names.
country_name_regex_to_iso3c <- function(country_name) {
  countrycode(
    country_name,
    origin = "country.name",
    destination = "iso3c",
    origin_regex = TRUE
  )
}

# Add an iso3c column derived from each bond's country_of_issue value.
Green_bond_full_dataset[["iso3c"]] <- country_name_regex_to_iso3c(
  Green_bond_full_dataset$country_of_issue
)

# Inspect the loaded datasets by printing them one after another.
emissions_dataset_full

imf_weo_by_country_2022

country_features_2022

# By this point Green_bond_full_dataset also carries the iso3c column.
Green_bond_full_dataset

# Bonds with no amount_outstanding_usd value, reduced to the columns of
# interest and de-duplicated.
missing_green_bond_data <- Green_bond_full_dataset %>%
  filter(is.na(amount_outstanding_usd)) %>%
  select(all_of(c(
    "country_of_issue",
    "amount_outstanding_usd",
    "coupon_class",
    "ESG_bond_type",
    "issued_amount_usd",
    "TRBC_sector",
    "iso3c"
  ))) %>%
  unique()

# Show the result.
missing_green_bond_data

library(visdat)
# Plot the missing-value pattern of the green-bond data.
vis_miss(x = Green_bond_full_dataset)

## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Same plot for the full emissions data, then for the country features.
vis_miss(x = emissions_dataset_full)

vis_miss(x = country_features_2022)

# Rows of the full emissions data with no consumption_co2 value, reduced to
# iso3c and consumption_co2 and de-duplicated.
missing_emissions_data <- emissions_dataset_full %>%
  filter(is.na(consumption_co2)) %>%
  select(all_of(c("iso3c", "consumption_co2"))) %>%
  unique()

# Show the result.
missing_emissions_data

# Write every dataset to the project's processed-data folder.
#
# Output paths are built with here() rather than a hard-coded absolute path,
# so the script works on any machine and matches how the raw inputs are
# located. On the original machine here() resolves to the same project root,
# so the files land in the same place as before.
processed_path <- partial(here, "03_data_processed")

# Make sure the output folder exists (no-op when it is already there).
dir.create(processed_path(), showWarnings = FALSE, recursive = TRUE)

# write.csv() keeps its default row-name column so the file layout stays
# identical for anything that already reads these outputs.
write.csv(
  emissions_dataset,
  file = processed_path("emissions_dataset.csv")
)
write.csv(
  emissions_dataset_full,
  file = processed_path("emissions_dataset_full.csv")
)
write.csv(
  imf_weo_by_country_2022,
  file = processed_path("imf_weo_by_country_2022_oct.csv")
)
write.csv(
  country_features_2022,
  file = processed_path("country_features_2022-10.csv")
)
# Green_bond_full_dataset is written with the iso3c column added earlier.
write.csv(
  Green_bond_full_dataset,
  file = processed_path("Green_bond_full_dataset.csv")
)