week 7

chooseCRANmirror(ind=1)
install.packages("openxlsx")

Installing package into 'C:/Users/Runnan/AppData/Local/R/win-library/4.2'
(as 'lib' is unspecified)

package 'openxlsx' successfully unpacked and MD5 sums checked

Warning: cannot remove prior installation of package 'openxlsx'

Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\Runnan\AppData\Local\R\win-library\4.2\00LOCK\openxlsx\libs\x64\openxlsx.dll
to
C:\Users\Runnan\AppData\Local\R\win-library\4.2\openxlsx\libs\x64\openxlsx.dll:
Permission denied

Warning: restored 'openxlsx'


The downloaded binary packages are in
    C:\Users\Runnan\AppData\Local\Temp\RtmpeCs4RC\downloaded_packages

install.packages("countrycode")

Installing package into 'C:/Users/Runnan/AppData/Local/R/win-library/4.2'
(as 'lib' is unspecified)

package 'countrycode' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\Runnan\AppData\Local\Temp\RtmpeCs4RC\downloaded_packages

install.packages("wbstats")

Installing package into 'C:/Users/Runnan/AppData/Local/R/win-library/4.2'
(as 'lib' is unspecified)

package 'wbstats' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\Runnan\AppData\Local\Temp\RtmpeCs4RC\downloaded_packages

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.1.8
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

library(here)

here() starts at C:/Users/Runnan/Documents/sustainable finance/sustainable finance

library(readxl)

library(rnaturalearth)
library(countrycode)
library(wbstats)
library(openxlsx)

country_features <- read_csv("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/country_features_2022-10.csv")

Rows: 217 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): country_name, iso3c, wb_income_group, wb_region
dbl (4): debt_gross_percent_of_gdp, nominal_gdp_bn_ppp, nominal_gdp_per_capi...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

file_path_1<- "C:/Users/Runnan/Documents/sustainable finance/oecd_emissions_data_example.xlsx"

oecd_emissions_data <- read_excel (file_path_1, skip=6)

New names:
• `` -> `...2`
• `` -> `...4`
• `` -> `...6`
• `` -> `...8`
• `` -> `...10`
• `` -> `...12`
• `` -> `...14`
• `` -> `...16`
• `` -> `...18`
• `` -> `...20`
• `` -> `...22`
• `` -> `...24`
• `` -> `...26`
• `` -> `...28`
• `` -> `...30`
• `` -> `...32`
• `` -> `...34`
• `` -> `...36`
• `` -> `...38`
• `` -> `...40`
• `` -> `...42`
• `` -> `...44`
• `` -> `...46`
• `` -> `...48`
• `` -> `...50`

file_path_2<-"C:/Users/Runnan/Documents/sustainable finance/PPPGDP.xlsx"
PPPGDP<-read_excel(file_path_2, sheet="Data",skip=3)

file_path_3<-"C:/Users/Runnan/Documents/sustainable finance/CAN_ghg_profile.xlsx"
CAN_ghg_profile<-read_excel(file_path_3, sheet="Data_by_sector")

print(PPPGDP)

print(CAN_ghg_profile)

library(dplyr)

by_country_PPPGDP <- PPPGDP %>%
  filter('Country Name' == "Canada")

by_country_PPPGDP

# A tibble: 0 × 66
# … with 66 variables: Country Name <chr>, Country Code <chr>,
#   Indicator Name <chr>, Indicator Code <chr>, 1960 <lgl>, 1961 <lgl>,
#   1962 <lgl>, 1963 <lgl>, 1964 <lgl>, 1965 <lgl>, 1966 <lgl>, 1967 <lgl>,
#   1968 <lgl>, 1969 <lgl>, 1970 <lgl>, 1971 <lgl>, 1972 <lgl>, 1973 <lgl>,
#   1974 <lgl>, 1975 <lgl>, 1976 <lgl>, 1977 <lgl>, 1978 <lgl>, 1979 <lgl>,
#   1980 <lgl>, 1981 <lgl>, 1982 <lgl>, 1983 <lgl>, 1984 <lgl>, 1985 <lgl>,
#   1986 <lgl>, 1987 <lgl>, 1988 <lgl>, 1989 <lgl>, 1990 <dbl>, 1991 <dbl>, …

CAN_CO2_emissions<- CAN_ghg_profile %>%
  slice(2) %>% # Keep only the second row
  select(matches("^20[01][0-9]$")) # Keep only columns with years from 2000 to 2020
CAN_CO2_emissions

library(tidyr)

CAN_CO2_emissions_long <- CAN_CO2_emissions %>%
  pivot_longer(cols = everything(), names_to = "Year", values_to = "CO2_emissions")

CAN_CO2_emissions_long

# A tibble: 20 × 2
   Year  CO2_emissions     
   <chr> <chr>             
 1 2000  566690.44691678847
 2 2001  558777.50723075634
 3 2002  564050.93057812075
 4 2003  581305.31992669078
 5 2004  579592.04363427311
 6 2005  574653.62831229414
 7 2006  568450.93637632171
 8 2007  593515.7636122331 
 9 2008  576558.33636307728
10 2009  543967.58957655379
11 2010  556560.51755309838
12 2011  567054.13294848893
13 2012  568223.14729128906
14 2013  572613.31066858128
15 2014  569839.68719576846
16 2015  574298.19010556443
17 2016  560525.31321941037
18 2017  571544.6194087594 
19 2018  584369.11620860291
20 2019  584714.18397954514

CAN_CO2_emissions_long$CO2_emissions <- as.numeric(as.character(CAN_CO2_emissions_long$CO2_emissions))

CAN_CO2_emissions_long

library(ggplot2)

library(scales)


Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

ggplot(CAN_CO2_emissions_long,aes(y=CO2_emissions,x=Year,group=2))+
        geom_line(linetype="dashed",color="#656565",size=0.5)+
         geom_point(color="#656565")+
          theme(panel.background=element_rect(fill="#e0e5df"))+
            ggtitle("Canada CO2 Emissions from 2000 to 2020")+
             scale_y_continuous(labels = comma_format(scale = 1e-0), breaks = seq(0, 600000, by = 10000))

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

print(CAN_ghg_profile)

CAN_GHG_emissions<- CAN_ghg_profile %>%
  slice(5) %>% # Keep only the second row
  select(matches("^20[01][0-9]$")) # Keep only columns with years from 2000 to 2020
CAN_GHG_emissions

CAN_GHG_emissions_long <- CAN_GHG_emissions %>%
  pivot_longer(cols = everything(), names_to = "Year", values_to = "GHG_emissions")

CAN_GHG_emissions_long

# A tibble: 20 × 2
   Year  GHG_emissions     
   <chr> <chr>             
 1 2000  726987.28299329593
 2 2001  718320.04315266339
 3 2002  724281.12955518556
 4 2003  743286.85438484722
 5 2004  745194.0474627834 
 6 2005  741182.84325515444
 7 2006  734586.99002724269
 8 2007  756833.25810556859
 9 2008  739063.15164894122
10 2009  697734.94493272307
11 2010  709654.06067916669
12 2011  720820.93074176926
13 2012  725587.10194199462
14 2013  732162.30161086132
15 2014  729599.78344042308
16 2015  732536.87240600237
17 2016  715095.85735205631
18 2017  725015.64661236701
19 2018  740006.26956342754
20 2019  738283.44606874988

CAN_GHG_emissions_long$GHG_emissions <- as.numeric(as.character(CAN_GHG_emissions_long$GHG_emissions))

CAN_GHG_emissions_long

# A tibble: 20 × 2
   Year  GHG_emissions
   <chr>         <dbl>
 1 2000        726987.
 2 2001        718320.
 3 2002        724281.
 4 2003        743287.
 5 2004        745194.
 6 2005        741183.
 7 2006        734587.
 8 2007        756833.
 9 2008        739063.
10 2009        697735.
11 2010        709654.
12 2011        720821.
13 2012        725587.
14 2013        732162.
15 2014        729600.
16 2015        732537.
17 2016        715096.
18 2017        725016.
19 2018        740006.
20 2019        738283.

Emissions<- merge(CAN_CO2_emissions_long,CAN_GHG_emissions_long,by=c("Year"))

Emissions

ggplot(Emissions, aes(x = Year)) +
  geom_point(aes(y = CO2_emissions, color = "CO2")) +
  geom_point(aes(y = GHG_emissions, color = "GHG")) +
  geom_line(aes(y = CO2_emissions, color = "CO2", group = 1)) +
  geom_line(aes(y = GHG_emissions, color = "GHG", group = 1)) +
  scale_color_manual("", values = c("CO2" = "#7b8b6f", "GHG" = "#8696a7")) +
  labs(title = "Canadian CO2 and GHG Emissions",
       x = "Year",
       y = "Emissions (kt CO2e)") +
      theme(panel.background=element_rect(fill="#e0e5df"))+
  scale_y_continuous(labels = comma_format(scale = 1e-0), breaks = seq(0, 800000, by = 10000))

PPPGDP

PPPGDP_20YEARS<- PPPGDP %>%
  select("Country Name", "Country Code", "Indicator Name", matches("^20[01][0-9]$")) # Keep only columns with years from 2000 to 2020
PPPGDP_20YEARS

oecd_emissions_data

oecd_emissions_actual_data <- oecd_emissions_data |> 
  # all the column headers with data are formatted: ...[Number]
  select(Time, contains("...")) |> 
  # except for one, so get rid of it.
  select(-`...2`) |> 
  # get rid of the first row, which has no data
  filter(Time != "Country")

oecd_emissions_actual_data

correct_column_names <- oecd_emissions_data |> 
  # select all columns that don't contain `...` in their name
  select(!contains("...")) |> 
  # selects the column names as a vector
  names()

correct_column_names

oecd_emissions_actual_step_2 <- oecd_emissions_actual_data |> 
  rename_with(.f = function(x) correct_column_names) |> 
  # rename the column to something that makes sense
  rename(geography = Time)

oecd_emissions_actual_step_2

geography_tbl <- oecd_emissions_actual_step_2 |> 
  select(geography)

geography_tbl

geography_tbl_2 <- geography_tbl |> 
  # separate the geography column 
  # into geography_code and geography_name
  # separate at the colon ":"
  # don't remove the original column (by default it removes the original column)
  separate(col = geography, into = c("geography_code", "geography_name"), 
           sep = ":", remove = FALSE) |> 
  # use str_trim() to make sure there is no extra spaces around the new names that might mess up other calculations
  mutate(across(c("geography_code", "geography_name"), str_trim))

geography_tbl_2

geography_tbl_2 |> 
  mutate(country_name = countrycode(sourcevar = geography_code, 
                                    origin = "iso3c", 
                                    destination = "country.name")) |> 
  filter(is.na(country_name))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `country_name = countrycode(sourcevar = geography_code, origin =
  "iso3c", destination = "country.name")`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC, ASEAN, DATA EXTRACTED ON 26 FEB 2023 00, EA19, EASIA, EU13, EU15, EU27_2020, EU28, G20, NONOECD, OECD, ROW, WLD, ZASI, ZEUR, ZNAM, ZOTH, ZSCA

geography_processed <- geography_tbl_2 |> 
  # use countrycode() to give the countries standardized names
  mutate(country_name = countrycode(sourcevar = geography_code, 
                                    origin = "iso3c", 
                                    destination = "country.name")) |>
  # use if_else to make a column telling whether the entity is a region or a country
  # if country_name is NA, then it is a region, if not it is a country
  mutate(country_or_region = if_else(is.na(country_name), 
                                     true = "region",
                                     false = "country")) |> 
  # fill in the region names into country_name using the same logic
  mutate(country_name = if_else(is.na(country_name),
                                true = geography_name,
                                false = country_name)) |> 
  # select the columns we want. 
  # we'll rename geography_code into iso3c to match our other standardized datasets.
  # we'll use the original geography column to join it to the original dataset.
  select(geography, country_name, iso3c = geography_code, country_or_region)

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `country_name = countrycode(sourcevar = geography_code, origin =
  "iso3c", destination = "country.name")`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: APEC, ASEAN, DATA EXTRACTED ON 26 FEB 2023 00, EA19, EASIA, EU13, EU15, EU27_2020, EU28, G20, NONOECD, OECD, ROW, WLD, ZASI, ZEUR, ZNAM, ZOTH, ZSCA

geography_processed

step3 <- geography_processed |> 
  left_join(oecd_emissions_actual_step_2, by = "geography") |> 
  # now that we've joined this back to the dataset, we don't need the geography column anymore
  select(-geography)

step3

step3 <- step3[,1:14]

step3 <- step3[,-3]

step4 <- step3 |> 
  # Use a `regular expression` (aka regex) to select any column name that is 4 digits (d{4})
  pivot_longer(cols = matches("\\d{4}"), 
               # change the names column to `year`
               names_to = "year", 
               # transform it into to integer data type (from character)
               names_transform = as.integer)

step4

step4<-step4%>%
  select(-iso3c)

step4 <- step4[-(1:20), ]

step4

step4_sub <- step4[step4$year >= 2000 & step4$year <= 2005, ]
step4_sub

selected_countries <- c("Australia", "Germany","Japan", "South Korea")
oecd_emissions_filtered <- step4_sub %>%
  filter(country_name%in% selected_countries)
oecd_emissions_filtered

ggplot(oecd_emissions_filtered, aes(x = country_name, y = value)) +
  geom_boxplot() +
  labs(title = "CO2 Emissions by Country",
       x = "Country", y = "CO2 Emissions")

ggplot(oecd_emissions_filtered, aes(x = year, y = value, color = value)) +
  geom_line(size = 1) +
  facet_wrap(~ country_name, ncol = 2) +
  labs(title = "CO2 Emissions by Sector and Year",
       x = "Year", y = "CO2 Emissions",
       color = "Value") +
  scale_color_gradient(low = "blue", high = "red")