assignmetn draft

chooseCRANmirror(ind=1)
install.packages("openxlsx")

Installing package into 'C:/Users/Runnan/AppData/Local/R/win-library/4.2'
(as 'lib' is unspecified)

package 'openxlsx' successfully unpacked and MD5 sums checked

Warning: cannot remove prior installation of package 'openxlsx'

Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\Runnan\AppData\Local\R\win-library\4.2\00LOCK\openxlsx\libs\x64\openxlsx.dll
to
C:\Users\Runnan\AppData\Local\R\win-library\4.2\openxlsx\libs\x64\openxlsx.dll:
Permission denied

Warning: restored 'openxlsx'


The downloaded binary packages are in
    C:\Users\Runnan\AppData\Local\Temp\RtmpmGtPwy\downloaded_packages

install.packages("countrycode")

Installing package into 'C:/Users/Runnan/AppData/Local/R/win-library/4.2'
(as 'lib' is unspecified)

package 'countrycode' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\Runnan\AppData\Local\Temp\RtmpmGtPwy\downloaded_packages

install.packages("wbstats")

Installing package into 'C:/Users/Runnan/AppData/Local/R/win-library/4.2'
(as 'lib' is unspecified)

package 'wbstats' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\Runnan\AppData\Local\Temp\RtmpmGtPwy\downloaded_packages

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.1.8
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

library(here)

here() starts at C:/Users/Runnan/Documents/sustainable finance/sustainable finance

library(readxl)

library(rnaturalearth)
library(countrycode)
library(wbstats)
library(openxlsx)

library(openxlsx)
NominalGDP <- read.xlsx("NominalGDP.xlsx")

PPPGDP <- read.xlsx("PPPGDP.xlsx")

country_features <- read_csv("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/country_features_2022-10.csv")

Rows: 217 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): country_name, iso3c, wb_income_group, wb_region
dbl (4): debt_gross_percent_of_gdp, nominal_gdp_bn_ppp, nominal_gdp_per_capi...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

country_features |> glimpse()

Rows: 217
Columns: 8
$ country_name               <chr> "Aruba", "Afghanistan", "Angola", "Albania"…
$ iso3c                      <chr> "ABW", "AFG", "AGO", "ALB", "AND", "ARE", "…
$ wb_income_group            <chr> "High", "Low", "Lower Middle", "Upper Middl…
$ wb_region                  <chr> "Latin America & Caribbean", "South Asia", …
$ debt_gross_percent_of_gdp  <dbl> 95.042, 7.397, 56.564, 70.278, 43.050, 30.7…
$ nominal_gdp_bn_ppp         <dbl> 5.066, 80.912, 245.442, 51.189, 5.301, 814.…
$ nominal_gdp_per_capita_ppp <dbl> 46308.66, 2456.29, 7455.47, 17858.42, 65371…
$ population_mn              <dbl> 0.109, 32.941, 32.921, 2.866, 0.081, 10.544…

file_path <- "C:/Users/Runnan/Documents/sustainable finance/oecd_emissions_data_example.xlsx"

oecd_emissions_data <- read_excel (file_path, skip=6)

New names:
• `` -> `...2`
• `` -> `...4`
• `` -> `...6`
• `` -> `...8`
• `` -> `...10`
• `` -> `...12`
• `` -> `...14`
• `` -> `...16`
• `` -> `...18`
• `` -> `...20`
• `` -> `...22`
• `` -> `...24`
• `` -> `...26`
• `` -> `...28`
• `` -> `...30`
• `` -> `...32`
• `` -> `...34`
• `` -> `...36`
• `` -> `...38`
• `` -> `...40`
• `` -> `...42`
• `` -> `...44`
• `` -> `...46`
• `` -> `...48`
• `` -> `...50`

install.packages("gdata")

Installing package into 'C:/Users/Runnan/AppData/Local/R/win-library/4.2'
(as 'lib' is unspecified)

installing the source package 'gdata'

library(gdata)

gdata: Unable to locate valid perl interpreter
gdata: 
gdata: read.xls() will be unable to read Excel XLS and XLSX files
gdata: unless the 'perl=' argument is used to specify the location of a
gdata: valid perl intrpreter.
gdata: 
gdata: (To avoid display of this message in the future, please ensure
gdata: perl is installed and available on the executable search path.)

gdata: Unable to load perl libaries needed by read.xls()
gdata: to support 'XLX' (Excel 97-2004) files.

gdata: Unable to load perl libaries needed by read.xls()
gdata: to support 'XLSX' (Excel 2007+) files.

gdata: Run the function 'installXLSXsupport()'
gdata: to automatically download and install the perl
gdata: libaries needed to support Excel XLS and XLSX formats.


Attaching package: 'gdata'

The following objects are masked from 'package:dplyr':

    combine, first, last

The following object is masked from 'package:purrr':

    keep

The following object is masked from 'package:stats':

    nobs

The following object is masked from 'package:utils':

    object.size

The following object is masked from 'package:base':

    startsWith

NominalGDP

write.xlsx(NominalGDP, "NominalGDP.xlsx", rowNames = FALSE)

NominalGDP

write.xlsx(PPPGDP, "PPPGDP.xlsx", rowNames = FALSE)

PPPGDP

oecd_emissions_data

oecd_emissions_actual_data <- oecd_emissions_data |> 
  # all the column headers with data are formatted: ...[Number]
  select(Time, contains("...")) |> 
  # except for one, so get rid of it.
  select(-`...2`) |> 
  # get rid of the first row, which has no data
  filter(Time != "Country")

oecd_emissions_actual_data

correct_column_names <- oecd_emissions_data |> 
  # select all columns that don't contain `...` in their name
  select(!contains("...")) |> 
  # selects the column names as a vector
  names()

correct_column_names

 [1] "Time" "1995" "1996" "1997" "1998" "1999" "2000" "2001" "2002" "2003"
[11] "2004" "2005" "2006" "2007" "2008" "2009" "2010" "2011" "2012" "2013"
[21] "2014" "2015" "2016" "2017" "2018"

oecd_emissions_actual_step_2 <- oecd_emissions_actual_data |> 
  rename_with(.f = function(x) correct_column_names) |> 
  # rename the column to something that makes sense
  rename(geography = Time)

oecd_emissions_actual_step_2

geography_tbl <- oecd_emissions_actual_step_2 |> 
  select(geography)

geography_tbl

geography_tbl_2 <- geography_tbl |> 
  # separate the geography column 
  # into geography_code and geography_name
  # separate at the colon ":"
  # don't remove the original column (by default it removes the original column)
  separate(col = geography, into = c("geography_code", "geography_name"), 
           sep = ":", remove = FALSE) |> 
  # use str_trim() to make sure there is no extra spaces around the new names that might mess up other calculations
  mutate(across(c("geography_code", "geography_name"), str_trim))

geography_tbl_2

geography_tbl_2 |> 
  mutate(country_name = countrycode(sourcevar = geography_code, 
                                    origin = "iso3c", 
                                    destination = "country.name")) |> 
  filter(is.na(country_name))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `country_name = countrycode(sourcevar = geography_code, origin =
  "iso3c", destination = "country.name")`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC, ASEAN, DATA EXTRACTED ON 26 FEB 2023 00, EA19, EASIA, EU13, EU15, EU27_2020, EU28, G20, NONOECD, OECD, ROW, WLD, ZASI, ZEUR, ZNAM, ZOTH, ZSCA

geography_processed <- geography_tbl_2 |> 
  # use countrycode() to give the countries standardized names
  mutate(country_name = countrycode(sourcevar = geography_code, 
                                    origin = "iso3c", 
                                    destination = "country.name")) |>
  # use if_else to make a column telling whether the entity is a region or a country
  # if country_name is NA, then it is a region, if not it is a country
  mutate(country_or_region = if_else(is.na(country_name), 
                                     true = "region",
                                     false = "country")) |> 
  # fill in the region names into country_name using the same logic
  mutate(country_name = if_else(is.na(country_name),
                                true = geography_name,
                                false = country_name)) |> 
  # select the columns we want. 
  # we'll rename geography_code into iso3c to match our other standardized datasets.
  # we'll use the original geography column to join it to the original dataset.
  select(geography, country_name, iso3c = geography_code, country_or_region)

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `country_name = countrycode(sourcevar = geography_code, origin =
  "iso3c", destination = "country.name")`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC, ASEAN, DATA EXTRACTED ON 26 FEB 2023 00, EA19, EASIA, EU13, EU15, EU27_2020, EU28, G20, NONOECD, OECD, ROW, WLD, ZASI, ZEUR, ZNAM, ZOTH, ZSCA

geography_processed

step3 <- geography_processed |> 
  left_join(oecd_emissions_actual_step_2, by = "geography") |> 
  # now that we've joined this back to the dataset, we don't need the geography column anymore
  select(-geography)

step3

step3 <- step3[,1:14]

step3 <- step3[,-3]

step4 <- step3 |> 
  # Use a `regular expression` (aka regex) to select any column name that is 4 digits (d{4})
  pivot_longer(cols = matches("\\d{4}"), 
               # change the names column to `year`
               names_to = "year", 
               # transform it into to integer data type (from character)
               names_transform = as.integer)

step4

library(dplyr)

NominalGDP

NominalGDP_1995_2005 <- NominalGDP %>%
  select(`Country.Name`, `Country.Code`,`1995`:`2005`)

NominalGDP_1995_2005 <- rename(NominalGDP_1995_2005, iso3c = "Country.Code" )

  head(NominalGDP_1995_2005)

NominalGDP_1995_2005

NominalGDP_pivot<- NominalGDP_1995_2005 |> 
  pivot_longer(cols = matches("\\d{4}"), 
               # change the names column to `year`
               names_to = "year", 
               # transform it into to integer data type (from character)
               names_transform = as.integer)

NominalGDP_pivot

NominalGDP_pivot <- rename(NominalGDP_pivot, GDP = 'value')

head(NominalGDP_pivot)

PPPGDP

PPPGDP_1995_2005 <- PPPGDP %>%
  select(`Country.Name`, `Country.Code`,`1995`:`2005`)

PPPGDP_1995_2005 <- rename(PPPGDP_1995_2005, iso3c = "Country.Code" )

  head(PPPGDP_1995_2005)

PPPGDP_pivot<- PPPGDP_1995_2005 |> 
  pivot_longer(cols = matches("\\d{4}"), 
               # change the names column to `year`
               names_to = "year", 
               # transform it into to integer data type (from character)
               names_transform = as.integer)

PPPGDP_pivot

PPPGDP_pivot <- rename(PPPGDP_pivot, PPPGDP = 'value')

head(PPPGDP_pivot)

PPPGDP_pivot<-PPPGDP_pivot[,-c(2:3)]

PPPGDP_pivot

# Identify rows in NominalGDP_pivot that do not have a match in PPPGDP_pivot
nominal_only <- NominalGDP_pivot %>%
  anti_join(PPPGDP_pivot, by = "Country.Name")

# Identify rows in PPPGDP_pivot that do not have a match in NominalGDP_pivot
ppp_only <- PPPGDP_pivot %>%
  anti_join(NominalGDP_pivot, by = "Country.Name")

nominal_only

# A tibble: 0 × 4
# … with 4 variables: Country.Name <chr>, iso3c <chr>, year <int>, GDP <dbl>

ppp_only

# A tibble: 0 × 2
# … with 2 variables: Country.Name <chr>, PPPGDP <dbl>

merged_data <- merge(NominalGDP_pivot, PPPGDP_pivot, by = "Country.Name")

# print the merged dataset
print(merged_data)

merged_data <- merged_data[,-c(1,3)]
print(merged_data)

merged_data_emission <- merge(merged_data,step4,by="iso3c")

merged_data_emission <- merged_data_emission %>%
  rename(emission = 'value')

print(merged_data_emission)

# change column order
merged_data_emission <- merged_data_emission[, c("country_name", "iso3c", "year", "GDP", "PPPGDP","emission")]

print(merged_data_emission)

merged_data_emission$Production_emission_intensity <- merged_data_emission$emission / merged_data_emission$GDP

Production_emission_intensity <- merged_data_emission[, c("country_name","iso3c","year", "Production_emission_intensity")]

print(Production_emission_intensity)

merged_data_emission$Production_emission_intensity_PPP <- merged_data_emission$emission / merged_data_emission$PPPGDP

Production_emission_intensity_PPP <- merged_data_emission[, c("country_name","iso3c","year", "Production_emission_intensity_PPP")]

print(Production_emission_intensity_PPP)