Data analysis PCAF

Author

Lara

library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(tidyr)
library(dplyr)
library(ggplot2)
library(here)
here() starts at C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff
library(rnaturalearth)
library(countrycode)
library(readr)

import processed data

scope_1_1 <- read_csv("C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff/Week 1 folder/scope_1_1")
Rows: 10920 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): country_name, iso3c
dbl (2): year, domestic_emissions

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
scope_1_2 <- read_csv("C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff/Week 1 folder/scope_1_2")
Rows: 2016 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): country_name, iso3c
dbl (2): year, domestic_export_emissions

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
scope_2 <- read_csv("C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff/Week 1 folder/scope_2")
Rows: 2016 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): country_name, iso3c
dbl (2): year, imported_grid_emissions

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
scope_3 <- read_csv("C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff/Week 1 folder/scope_3")
Rows: 2016 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): country_name, iso3c
dbl (2): year, imported_total_emissions

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nominal_GDP <- read_csv("C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff/Week 1 folder/nominal_GDP")
Rows: 16492 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): country_name, iso3c
dbl (2): year, nominal_gdp

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
population <- read_csv("C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff/Week 1 folder/population")
Rows: 16492 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): country_name, iso3c
dbl (2): year, population

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ppp_gdp <- read_csv("C:/Users/Lara/Documents/R/Sustainable Finance/Lara's R stuff/Week 1 folder/ppp_gdp")
Rows: 16492 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): country_name, iso3c
dbl (2): year, ppp_gdp

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

create datasets with which to work

#first join scope_1_1 and scope_1_2 since scope 1 emissions include emissions from exported goods and services 
#also keep scope_1_1 for analysis only based on this because I think there will be much info lost by joining them 

scope_1_merged <- left_join(scope_1_1, select(scope_1_2, country_name, year, domestic_export_emissions), 
                          by = c("country_name", "year"))

#lets see how much information I lost by merging the two datasets by checking which countries have no information at all for "domestic_export_emissions" 

unique_countries_scope_1 <- scope_1_merged |>
  filter(!is.na(domestic_emissions)) |>
  group_by(country_name) |>
  filter(all(is.na(domestic_export_emissions))) |>
  select(country_name) |>
  distinct()

unique_countries_scope_1
# A tibble: 145 × 1
# Groups:   country_name [145]
   country_name          
   <chr>                 
 1 International Aviation
 2 International Shipping
 3 Afghanistan           
 4 Albania               
 5 Algeria               
 6 Angola                
 7 Anguilla              
 8 Antigua & Barbuda     
 9 Armenia               
10 Aruba                 
# … with 135 more rows
#those are 145 countries for which no export emissions are available. 
#lets join them with ppp gdp and see which countries are biggest 

unique_countries_scope_1  <- unique_countries_scope_1 |>
  left_join(population, by = "country_name")
Warning in left_join(unique_countries_scope_1, population, by = "country_name"): Each row in `x` is expected to match at most 1 row in `y`.
ℹ Row 3 of `x` matches multiple rows.
ℹ If multiple matches are expected, set `multiple = "all"` to silence this
  warning.
unique_countries_scope_1 <- left_join(unique_countries_scope_1, nominal_GDP, by = c("country_name", "iso3c", "year"))

unique_countries_scope_1 <- left_join(unique_countries_scope_1, ppp_gdp, by = c("country_name", "iso3c", "year"))

# filter joined_data to keep only year 2019
cross_section_unique_scope_1_2018 <- unique_countries_scope_1 |>
  filter(year == 2018)

#plot top 10 which countries don't hava data for exported emissions 
top_10_unique_2018_scope_1 <- cross_section_unique_scope_1_2018 |>
  group_by(country_name) |>
  summarize(total_population = sum(population, na.rm = TRUE)) |>
  arrange(desc(total_population)) |>
  slice(1:10)

#turn this into a graph 

library(ggplot2)

ggplot(top_10_unique_2018_scope_1, aes(x=reorder(country_name, total_population), y = total_population)) +
  geom_bar(stat = "identity") +
  labs(x = "Country", y = "Population") +
  scale_y_continuous(labels = scales::comma) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Countries without data for exported emissions in 2018")

#build a masterdataset 

#begin with joining the basic data. 

masterdataset <- left_join(population, nominal_GDP, by = c("country_name", "iso3c", "year"))

masterdataset <- left_join(masterdataset, ppp_gdp, by = c("country_name", "iso3c", "year"))

masterdataset <- left_join(masterdataset, scope_1_merged, by = c("country_name", "iso3c", "year"))

#add domestic emissions and domestic export emissions where it is possible 

masterdataset <- masterdataset |>
  mutate(scope_1 = ifelse(is.na(domestic_emissions), 0, domestic_emissions) + 
                      ifelse(is.na(domestic_export_emissions), 0, domestic_export_emissions))|>
  mutate(scope_1 = replace(scope_1, scope_1 == 0, NA))

#now add other emissions data 

masterdataset <- left_join(masterdataset, scope_2, by = c("country_name", "iso3c", "year"))

masterdataset <- left_join(masterdataset, scope_3, by = c("country_name", "iso3c", "year"))

#create a crossection for 2018 

Masterdataset_2018 <- masterdataset|>
  filter(year == 2018)

#see who has the highest scope 1 emmissions 

Masterdataset_2018_scope1 <- Masterdataset_2018 |>
  group_by(country_name) |>
  summarize(total_scope_1 = sum(scope_1, na.rm = TRUE)) |>
  arrange(desc(total_scope_1)) |>
  slice(1:20)


ggplot(Masterdataset_2018_scope1, aes(x=reorder(country_name, total_scope_1), y = total_scope_1)) +
  geom_bar(stat = "identity") +
  labs(x = "Country", y = "Scope 1 Emissions") +
  scale_y_continuous(labels = scales::comma) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Countries with the highest scope 1 emissions in 2018")

#see who has the highest scope 2 emissions 

Masterdataset_2018_scope2 <- Masterdataset_2018 |>
  group_by(country_name) |>
  summarize(total_scope_2 = sum(imported_grid_emissions, na.rm = TRUE)) |>
  arrange(desc(total_scope_2)) |>
  slice(1:20)

#see who has the highest scope 1 emmissions 


ggplot(Masterdataset_2018_scope2, aes(x=reorder(country_name, total_scope_2), y = total_scope_2)) +
  geom_bar(stat = "identity") +
  labs(x = "Country", y = "Scope 2 Emissions") +
  scale_y_continuous(labels = scales::comma) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Countries with the highest scope 2 emissions in 2018")

#see who has the highest scope 3 emissions 

Masterdataset_2018_scope3 <- Masterdataset_2018 |>
  group_by(country_name) |>
  summarize(total_scope_3 = sum(imported_total_emissions, na.rm = TRUE)) |>
  arrange(desc(total_scope_3)) |>
  slice(1:20)

#see who has the highest scope 1 emmissions 


ggplot(Masterdataset_2018_scope3, aes(x=reorder(country_name, total_scope_3), y = total_scope_3)) +
  geom_bar(stat = "identity") +
  labs(x = "Country", y = "Scope 3 Emissions") +
  scale_y_continuous(labels = scales::comma) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Countries with the highest scope 3 emissions in 2018")