Data Cleaning

rm(list = ls())

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6      v purrr   0.3.4 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.1      v stringr 1.4.1 
## v readr   2.1.2      v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.1.3

## Warning: package 'tibble' was built under R version 4.1.3

## Warning: package 'tidyr' was built under R version 4.1.3

## Warning: package 'readr' was built under R version 4.1.2

## Warning: package 'purrr' was built under R version 4.1.2

## Warning: package 'dplyr' was built under R version 4.1.3

## Warning: package 'stringr' was built under R version 4.1.3

## Warning: package 'forcats' was built under R version 4.1.2

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(here)

## Warning: package 'here' was built under R version 4.1.3

## here() starts at C:/Users/arink/OneDrive/SAIS/Fall 2022/Sustainable Finance/Final Project

library(janitor)

## Warning: package 'janitor' was built under R version 4.1.3

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(countrycode)

## Warning: package 'countrycode' was built under R version 4.1.3

library(readxl)
library(lubridate)

## Warning: package 'lubridate' was built under R version 4.1.3

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(forcats)
library(esquisse)

## Warning: package 'esquisse' was built under R version 4.1.3

options(scipen=10) # forces regular notation vs scientific notation (ie5)```

raw_clim_exp <- here("00_data_raw", "oecd", "SGCFD_03112022223728001.csv") %>% read_csv()

## Rows: 27281 Columns: 15
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (10): COUNTRY, Country, FORM, Form of the State, SECTOR, Sector, INDICAT...
## dbl  (3): YEAR, Year, Value
## lgl  (2): Flag Codes, Flags
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

#raw_clim_inv <- here("00_data_raw", "oecd", "SGCFD_03112022223749048.csv") %>% read_csv() 

raw_country_feat <- here("00_data_raw", "country_features_2022-10.csv") %>% read_csv()

## Rows: 217 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): country_name, iso3c, wb_income_group, wb_region
## dbl (4): debt_gross_percent_of_gdp, nominal_gdp_bn_ppp, nominal_gdp_per_capi...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

raw_imf_weo <- here("00_data_raw", "imf_weo_by_country_2022_oct.csv") %>% read_csv()

## Rows: 414000 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): country_name, iso3c, short_name_unit, short_name, short_unit, categ...
## dbl (2): year, value
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

urb_lvl <-here("00_data_raw", "API_SP.URB.TOTL.IN.ZS_DS2_en_csv_v2_4670460.csv") %>% read_csv()

## New names:
## Rows: 270 Columns: 66
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (4): Data Source, World Development Indicators, ...3, ...4 dbl (62): ...5,
## ...6, ...7, ...8, ...9, ...10, ...11, ...12, ...13, ...14, ....
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...3`
## * `` -> `...4`
## * `` -> `...5`
## * `` -> `...6`
## * `` -> `...7`
## * `` -> `...8`
## * `` -> `...9`
## * `` -> `...10`
## * `` -> `...11`
## * `` -> `...12`
## * `` -> `...13`
## * `` -> `...14`
## * `` -> `...15`
## * `` -> `...16`
## * `` -> `...17`
## * `` -> `...18`
## * `` -> `...19`
## * `` -> `...20`
## * `` -> `...21`
## * `` -> `...22`
## * `` -> `...23`
## * `` -> `...24`
## * `` -> `...25`
## * `` -> `...26`
## * `` -> `...27`
## * `` -> `...28`
## * `` -> `...29`
## * `` -> `...30`
## * `` -> `...31`
## * `` -> `...32`
## * `` -> `...33`
## * `` -> `...34`
## * `` -> `...35`
## * `` -> `...36`
## * `` -> `...37`
## * `` -> `...38`
## * `` -> `...39`
## * `` -> `...40`
## * `` -> `...41`
## * `` -> `...42`
## * `` -> `...43`
## * `` -> `...44`
## * `` -> `...45`
## * `` -> `...46`
## * `` -> `...47`
## * `` -> `...48`
## * `` -> `...49`
## * `` -> `...50`
## * `` -> `...51`
## * `` -> `...52`
## * `` -> `...53`
## * `` -> `...54`
## * `` -> `...55`
## * `` -> `...56`
## * `` -> `...57`
## * `` -> `...58`
## * `` -> `...59`
## * `` -> `...60`
## * `` -> `...61`
## * `` -> `...62`
## * `` -> `...63`
## * `` -> `...64`
## * `` -> `...65`
## * `` -> `...66`

##Drop duplicate columns in OECD expenditure dataset
clim_exp_condensed = select(raw_clim_exp, -3,-5,-7,-9,-11,)

clim_exp_clean <- clim_exp_condensed %>% 
  mutate(country_code = COUNTRY, 
         country_name = Country
         ) %>% 
        clean_names() %>%
        select(-1,-2,9,-10) 
   

##How many measures are we looking at? 
clim_exp_clean %>%
  clean_names() %>%
  select(measure) %>%
  unique()

## # A tibble: 7 x 1
##   measure                                      
##   <chr>                                        
## 1 Amount (US dollars PPP)                      
## 2 Amount (US dollars PPP real)                 
## 3 Per capita (US dollars PPP)                  
## 4 Per capita (US dollars PPP real)             
## 5 % of GDP                                     
## 6 % of general government, same transaction    
## 7 % of subnational government, same transaction

write_csv(clim_exp_clean, here("01_data_cleaning", "clim_exp_clean.csv"))

##Clean Urbanization Dataset, get it ready for join

urb_lvl_long <- read_excel("C:\\Users\\arink\\OneDrive\\SAIS\\Fall 2022\\Sustainable Finance\\Final Project\\00_data_raw\\API_SP.URB.TOTL.IN.ZS_DS2_en_csv_v2_4670460.xlsx", range = "A5:BN271") %>%
  #select all of the columns corresponding to the year to pivot longer
   pivot_longer(5:66, names_to = "year", values_to = "urb_level")  %>%
    #Clean up names after the pivot to avoid making the year into a string variable
    clean_names() %>%
    #Excludes regional groups like Central Europe and the Baltics - fine for our purposes
    mutate(country_name = countrycode(country_code, origin = "wb", destination = "country.name")) %>%
  #get rid of indicator name columns
    select(-3,-4)

## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: AFE, AFW, ARB, CEB, CSS, EAP, EAR, EAS, ECA, ECS, EMU, EUU, FCS, HIC, HPC, IBD, IBT, IDA, IDB, IDX, INX, LAC, LCN, LDC, LIC, LMC, LMY, LTE, MEA, MIC, MNA, NAC, OED, OSS, PRE, PSS, PST, SAS, SSA, SSF, SST, TEA, TEC, TLA, TMN, TSA, TSS, UMC, WLD

write_csv(urb_lvl_long, here("01_data_cleaning", "urb_lvl_long.csv"))

##Merge with other datasets

clim_exp_merged <- clim_exp_clean %>%
  # rename to iso3c to align with other datasets
  #rename(iso3c = COUNTRY) %>%
  # use countrycode country
  #mutate(country_name = countrycode(Country, origin = "iso3c", destination = "country.name")) %>%
  # First, join with the country feature dataset produced by Teal
  left_join(raw_country_feat, by = "country_name") %>%
  #Next, join with the World Bank urbanization level data, which estimates each country's level of   urbanization every year
  left_join(urb_lvl_long, by = "country_name", "year")

write_csv(clim_exp_merged, here("03_data_processed", "clim_exp_merged.csv"))

Data Cleaning

Arin Kerstein

11/3/2022