Final data cleanup

Final Project Data Analysis

library(readxl)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(writexl)
library(dplyr)
library(countrycode)
library(here) # for using relative file paths, here() starts at /Users/ciel.wang/Desktop/JH/R/00_data_raw 
here() starts at /Users/ciel.wang/Desktop/JH/2024 spring/sustainable/final project/data
#read the excel file
setwd("/Users/ciel.wang/Desktop/JH/R/00_data_raw")

#Carbon emission data
path_to_sheet <- here("00_data_raw", "Book3.xlsx")
emission <- read_xlsx("Book3.xlsx", 
                               sheet = 'emission_by_country'  )

#green financing data
green_debt <- read_xlsx("Book3.xlsx", 
                               sheet = 'green_debt'  )

Before cleaning and analyzing the data, we first standardized the country names and codes to make sure that they are matched in both data frames:

#design a function to standardize country name and code 
country_regex_to_iso3c <- function(country_string) {
  country_string |>
    countrycode::countrycode(origin = "country.name", destination = "iso3c", origin_regex = TRUE)
}

iso3c_to_country_name <- function(country_string) {
  country_string |>
    countrycode::countrycode(origin = "iso3c", destination = "country.name")
} 

Next, pivot longer each data sets and select the relevant columns:

emission <- emission %>% 
  mutate(continent = countrycode(Country_Name, origin = "country.name", destination = "continent"))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `continent = countrycode(Country_Name, origin = "country.name",
  destination = "continent")`.
Caused by warning:
! Some values were not matched unambiguously: Int. Aviation, Int. Shipping, Serbia and Montenegro, Virgin Islands_USA
emission_long <- emission |> 
  pivot_longer(
    cols= starts_with('Y_'),
    names_to = 'year',
    values_to = 'emission'
  ) |>
  rename(ISO3=Country_code_A3) |>
  mutate(year=parse_number(year)) |>
    select('ISO3','Country_Name', 'year','emission')
emission_long
# A tibble: 11,819 × 4
   ISO3  Country_Name  year emission
   <chr> <chr>        <dbl>    <dbl>
 1 ABW   Aruba         1970     45.2
 2 ABW   Aruba         1971     50.1
 3 ABW   Aruba         1972     60.9
 4 ABW   Aruba         1973     65.8
 5 ABW   Aruba         1974     65.1
 6 ABW   Aruba         1975     79.1
 7 ABW   Aruba         1976     78.2
 8 ABW   Aruba         1977     88.9
 9 ABW   Aruba         1978     93.7
10 ABW   Aruba         1979     97.7
# ℹ 11,809 more rows
green_debt_long <- green_debt |> 
  pivot_longer(
    cols= starts_with('F'),
    names_to = 'year',
    values_to = 'debt_issuance',
    values_drop_na = TRUE
  ) |>
  mutate(year=parse_number(year)) |>
  select("ISO3",Country_Name, year, debt_issuance)
green_debt_long
# A tibble: 863 × 4
   ISO3  Country_Name  year debt_issuance
   <chr> <chr>        <dbl>         <dbl>
 1 ARG   Argentina     2017        0.974 
 2 ARG   Argentina     2020        0.0500
 3 ARG   Argentina     2021        0.916 
 4 ARG   Argentina     2022        0.207 
 5 AUS   Australia     2014        0.526 
 6 AUS   Australia     2015        0.413 
 7 AUS   Australia     2016        0.531 
 8 AUS   Australia     2017        2.53  
 9 AUS   Australia     2018        2.22  
10 AUS   Australia     2019        1.98  
# ℹ 853 more rows

Then we can try to standardize country variables so that the two data frames can be merged:

#### standardzied country names and codes
emission_long |>
  mutate(ISO3 = country_regex_to_iso3c(Country_Name))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `ISO3 = country_regex_to_iso3c(Country_Name)`.
Caused by warning:
! Some values were not matched unambiguously: Int. Aviation, Int. Shipping, Netherlands Antilles, Serbia and Montenegro, Virgin Islands_USA
# A tibble: 11,819 × 4
   ISO3  Country_Name  year emission
   <chr> <chr>        <dbl>    <dbl>
 1 ABW   Aruba         1970     45.2
 2 ABW   Aruba         1971     50.1
 3 ABW   Aruba         1972     60.9
 4 ABW   Aruba         1973     65.8
 5 ABW   Aruba         1974     65.1
 6 ABW   Aruba         1975     79.1
 7 ABW   Aruba         1976     78.2
 8 ABW   Aruba         1977     88.9
 9 ABW   Aruba         1978     93.7
10 ABW   Aruba         1979     97.7
# ℹ 11,809 more rows
green_debt_long |>
  mutate(ISO3 = country_regex_to_iso3c(Country_Name))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `ISO3 = country_regex_to_iso3c(Country_Name)`.
Caused by warning:
! Some values were not matched unambiguously: World
# A tibble: 863 × 4
   ISO3  Country_Name  year debt_issuance
   <chr> <chr>        <dbl>         <dbl>
 1 ARG   Argentina     2017        0.974 
 2 ARG   Argentina     2020        0.0500
 3 ARG   Argentina     2021        0.916 
 4 ARG   Argentina     2022        0.207 
 5 AUS   Australia     2014        0.526 
 6 AUS   Australia     2015        0.413 
 7 AUS   Australia     2016        0.531 
 8 AUS   Australia     2017        2.53  
 9 AUS   Australia     2018        2.22  
10 AUS   Australia     2019        1.98  
# ℹ 853 more rows

Join the data frames by country and year and calculate the financed emission based on the sovereign debt metric:

debt_emission <- left_join( green_debt_long, emission_long,
                                     by = c("ISO3" = "ISO3", "year" = "year", "Country_Name" = "Country_Name")) |>
  group_by(ISO3, Country_Name) |>
  mutate(
    previous_year_emission = lag(emission),
    percentage_change_emission = (emission - previous_year_emission)/ previous_year_emission * 100
  ) |>
   ungroup() |>
  select("ISO3", "Country_Name", "year", "debt_issuance","emission",
        "percentage_change_emission") 

debt_emission
# A tibble: 863 × 6
   ISO3  Country_Name  year debt_issuance emission percentage_change_emission
   <chr> <chr>        <dbl>         <dbl>    <dbl>                      <dbl>
 1 ARG   Argentina     2017        0.974   379169.                    NA     
 2 ARG   Argentina     2020        0.0500  359026.                    -5.31  
 3 ARG   Argentina     2021        0.916   378420.                     5.40  
 4 ARG   Argentina     2022        0.207   382992.                     1.21  
 5 AUS   Australia     2014        0.526   585442.                    NA     
 6 AUS   Australia     2015        0.413   593561.                     1.39  
 7 AUS   Australia     2016        0.531   594024.                     0.0780
 8 AUS   Australia     2017        2.53    601065.                     1.19  
 9 AUS   Australia     2018        2.22    597774.                    -0.548 
10 AUS   Australia     2019        1.98    596032.                    -0.291 
# ℹ 853 more rows

Emission Levels for Top 5 Countries with the Highest Emissions in a Given Year

top_countries_2020 <- debt_emission %>%
  filter(year == 2020) %>%
  arrange(desc(emission)) %>%
  slice(1:5)

ggplot(data = top_countries_2020,
       aes(x = reorder(Country_Name, emission), y = emission)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 Countries by Emission Levels in 2020",
       x = "Country",
       y = "Emission Levels") +
  theme_minimal() +
  coord_flip() # Flips the axes for easier reading

top_countries_2021 <- debt_emission %>%
  filter(year == 2021) %>%
  arrange(desc(emission)) %>%
  slice(1:5)

ggplot(data = top_countries_2021,
       aes(x = reorder(Country_Name, emission), y = emission)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 Countries by Emission Levels in 2021",
       x = "Country",
       y = "Emission Levels") +
  theme_minimal() +
  coord_flip() # Flips the axes for easier reading

top_countries_2022 <- debt_emission %>%
  filter(year == 2022) %>%
  arrange(desc(emission)) %>%
  slice(1:5)

ggplot(data = top_countries_2022,
       aes(x = reorder(Country_Name, emission), y = emission)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 Countries by Emission Levels in 2022",
       x = "Country",
       y = "Emission Levels") +
  theme_minimal() +
  coord_flip() # Flips the axes for easier reading

Comparing Percentage Change in Emission Year-over-Year for Each Country

ggplot(data = debt_emission,
       aes(x = year, y = percentage_change_emission)) +
  geom_smooth(se = FALSE) +  # Add a smooth line without a confidence interval
  labs(title = "Year-over-Year Change in Emissions",
       x = "Year",
       y = "Percentage Change in Emissions") +
  theme_minimal() +
  theme(legend.position = "none")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 554 rows containing non-finite values (`stat_smooth()`).

Debt Issuance vs. Emission Scatter Plot for All Countries

ggplot(data = debt_emission,
       aes(x = debt_issuance, y = emission)) +
  geom_point() +  # Plot the data points
  geom_smooth(method = "lm",   # Add a linear regression line
              color = "blue",   # You can choose the color of the line
              se = FALSE) +    # 'se' controls the display of the confidence interval around the line
  labs(title = "Debt Issuance vs. Emissions with Best Fit Line",
       x = "Debt Issuance",
       y = "Emissions") +
  theme_minimal() +
  scale_x_continuous(limits = c(0, 100))  # Set x-axis limits
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 498 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 498 rows containing missing values (`geom_point()`).

ggplot(data = debt_emission,
       aes(x = debt_issuance, y = emission, group = factor(year), color = factor(year))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # Add linear regression lines for each year
  labs(title = "Debt Issuance vs. Emissions with Best Fit Line by Year",
       x = "Debt Issuance",
       y = "Emissions",
       color = "Year") +
  theme_minimal() +
  scale_x_continuous(limits = c(0, 100)) 
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 498 rows containing non-finite values (`stat_smooth()`).
Removed 498 rows containing missing values (`geom_point()`).

Folder Structure: Create 3 folders (and as many sub-folders as you would like)

  • 00_data_raw:  put your raw files in here.

  • 01_data_cleaning: put your r notebooks in hear that you use to clean the data.

  • 03_data_processed:  write your processed files into this folder (as .csv, .rds, or whatever file format you prefer).

 

Include your data tidying plan in an initial section. Try to execute your plan from step 1.   Use headers and text to narrate what you are trying to do in code chunks.

When you’ve finished tidying your data, write it to the folder 03_data_processed