HW4_NadiaXING_dataTidying

Author

Nadia XING

Problem1

options(repos = c(CRAN = "https://cloud.r-project.org/"))
install.packages("countrycode")

The downloaded binary packages are in
    /var/folders/pz/gf8r5yfj7r93ln6kz_gbkjtc0000gn/T//RtmpOm6y7M/downloaded_packages
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
imf_climate_dashboards_green_debt_url <- "https://opendata.arcgis.com/datasets/8e2772e0b65f4e33a80183ce9583d062_0.csv"

green_debt <- imf_climate_dashboards_green_debt_url |> 
  read_csv() 
Rows: 355 Columns: 42
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): Country, ISO2, ISO3, Indicator, Unit, Source, CTS_Code, CTS_Name, ...
dbl (30): ObjectId, F1985, F1986, F1987, F1990, F1991, F1992, F1993, F1994, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# we want to compare these two indicators
indicators_we_want <- c("Green Bond Issuances by Country", "Sovereign Green Bond Issuances")

green_debt_subset <- green_debt |> 
  # from the janitor package -- makes variables snake_case so they are easier to work with
  clean_names() |> 
  # filter for the vector of indicators we defined above
  filter(indicator %in% indicators_we_want) |> 
  # "f\\d{4}" is a regular expression (regex) that searches for all columns that are f + four digits.
  # Ask ChatGPT to explain this to you.
  select(country, iso3, indicator, matches("f\\d{4}")) 

green_debt_subset 
# A tibble: 107 × 32
   country iso3  indicator f1985 f1986 f1987 f1990 f1991 f1992 f1993 f1994 f1999
   <chr>   <chr> <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 Argent… ARG   Green Bo…    NA    NA    NA    NA    NA    NA    NA    NA    NA
 2 Austra… AUS   Green Bo…    NA    NA    NA    NA    NA    NA    NA    NA    NA
 3 Austria AUT   Green Bo…    NA    NA    NA    NA    NA    NA    NA    NA    NA
 4 Austria AUT   Sovereig…    NA    NA    NA    NA    NA    NA    NA    NA    NA
 5 Bangla… BGD   Green Bo…    NA    NA    NA    NA    NA    NA    NA    NA    NA
 6 Belaru… BLR   Green Bo…    NA    NA    NA    NA    NA    NA    NA    NA    NA
 7 Belaru… BLR   Sovereig…    NA    NA    NA    NA    NA    NA    NA    NA    NA
 8 Belgium BEL   Green Bo…    NA    NA    NA    NA    NA    NA    NA    NA    NA
 9 Belgium BEL   Sovereig…    NA    NA    NA    NA    NA    NA    NA    NA    NA
10 Bermuda BMU   Green Bo…    NA    NA    NA    NA    NA    NA    NA    NA    NA
# ℹ 97 more rows
# ℹ 20 more variables: f2000 <dbl>, f2002 <dbl>, f2003 <dbl>, f2004 <dbl>,
#   f2007 <dbl>, f2008 <dbl>, f2009 <dbl>, f2010 <dbl>, f2011 <dbl>,
#   f2012 <dbl>, f2013 <dbl>, f2014 <dbl>, f2015 <dbl>, f2016 <dbl>,
#   f2017 <dbl>, f2018 <dbl>, f2019 <dbl>, f2020 <dbl>, f2021 <dbl>,
#   f2022 <dbl>
library(dplyr)
library(countrycode)
green_debt_subset <- green_debt_subset |> 
  mutate(region = countrycode(iso3, "iso3c", "region")) |> 
  select(region, everything())
green_debt_subset
# A tibble: 107 × 33
   region      country iso3  indicator f1985 f1986 f1987 f1990 f1991 f1992 f1993
   <chr>       <chr>   <chr> <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 Latin Amer… Argent… ARG   Green Bo…    NA    NA    NA    NA    NA    NA    NA
 2 East Asia … Austra… AUS   Green Bo…    NA    NA    NA    NA    NA    NA    NA
 3 Europe & C… Austria AUT   Green Bo…    NA    NA    NA    NA    NA    NA    NA
 4 Europe & C… Austria AUT   Sovereig…    NA    NA    NA    NA    NA    NA    NA
 5 South Asia  Bangla… BGD   Green Bo…    NA    NA    NA    NA    NA    NA    NA
 6 Europe & C… Belaru… BLR   Green Bo…    NA    NA    NA    NA    NA    NA    NA
 7 Europe & C… Belaru… BLR   Sovereig…    NA    NA    NA    NA    NA    NA    NA
 8 Europe & C… Belgium BEL   Green Bo…    NA    NA    NA    NA    NA    NA    NA
 9 Europe & C… Belgium BEL   Sovereig…    NA    NA    NA    NA    NA    NA    NA
10 North Amer… Bermuda BMU   Green Bo…    NA    NA    NA    NA    NA    NA    NA
# ℹ 97 more rows
# ℹ 22 more variables: f1994 <dbl>, f1999 <dbl>, f2000 <dbl>, f2002 <dbl>,
#   f2003 <dbl>, f2004 <dbl>, f2007 <dbl>, f2008 <dbl>, f2009 <dbl>,
#   f2010 <dbl>, f2011 <dbl>, f2012 <dbl>, f2013 <dbl>, f2014 <dbl>,
#   f2015 <dbl>, f2016 <dbl>, f2017 <dbl>, f2018 <dbl>, f2019 <dbl>,
#   f2020 <dbl>, f2021 <dbl>, f2022 <dbl>
green_debt_subset <- green_debt_subset |> 
  mutate(total_issuance = rowSums(across(starts_with("f")), na.rm = TRUE))

cumulative_issuance_by_region <- green_debt_subset |> 
  group_by(region) |> 
  summarize(cumulative_issuance = sum(total_issuance, na.rm = TRUE))

cumulative_issuance_by_region
# A tibble: 7 × 2
  region                     cumulative_issuance
  <chr>                                    <dbl>
1 East Asia & Pacific                      586. 
2 Europe & Central Asia                   1395. 
3 Latin America & Caribbean                 99.8
4 Middle East & North Africa                10.3
5 North America                            239. 
6 South Asia                                15.0
7 Sub-Saharan Africa                        16.2
library(ggplot2)
plot1 <- ggplot(cumulative_issuance_by_region, aes(x = reorder(region, -cumulative_issuance), y = cumulative_issuance, fill = region)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  scale_y_continuous (labels = scales::label_dollar(suffix = " bn"),
                     expand = c(0,0)) +
  labs(
    title = "Cumulative Issuance of Green Bonds by Region",
    x = "Region",
    y = "Cumulative Issuance (USD)",
    fill = "Region"
  ) +
  coord_flip()

print(plot1)

Problem2

library(tidyverse)
library(janitor)
library(scales)

Attaching package: 'scales'
The following object is masked from 'package:purrr':

    discard
The following object is masked from 'package:readr':

    col_factor
green_debt_plot2 <- green_debt |>
  clean_names() |>
  filter(type_of_issuer != "Not Applicable") |> 
  pivot_longer(
    cols = matches("f\\d{4}"),
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = list(year = readr::parse_number), 
    values_drop_na = TRUE
  ) %>%
  group_by(type_of_issuer, year) %>%
  summarise(total_issuance_bn_usd = sum(issuance_bn_usd, na.rm = TRUE))
`summarise()` has grouped output by 'type_of_issuer'. You can override using
the `.groups` argument.
green_debt_plot2 %>%
  ggplot(aes(x = year, y = total_issuance_bn_usd, fill = type_of_issuer)) +
    geom_col() +
    coord_flip()+
    scale_x_reverse(limits = c(2020,2010)) +
    scale_y_continuous(labels = label_dollar(suffix = " bn"), expand = c(0, 0)) +

    labs(
      title = "Total Green Bond Issuance by Type of Issuer",
      x = "Year",
      y = "Total Issuance (USD)"
    ) +
    theme_minimal() +
    theme(legend.position = "bottom")
Warning: Removed 37 rows containing missing values (`position_stack()`).
Warning: Removed 9 rows containing missing values (`geom_col()`).

Factoids:

From 2010 to 2020, the types of issuer are becoming more and more diversified.

Total Issuance in 2020 is more than five times bigger than it in 2015.

Problem3 Q1

library(tidyverse)
library(janitor)

green_debt_plot3 <- green_debt |> 
  clean_names() |> 
  filter(use_of_proceed != "Not Applicable") |> 
  pivot_longer(
    cols = matches("f\\d{4}"),
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = list(year = readr::parse_number),
    values_drop_na = TRUE
  ) 
print(green_debt_plot3)
# A tibble: 138 × 15
   object_id country iso2  iso3  indicator        unit  source cts_code cts_name
       <dbl> <chr>   <chr> <chr> <chr>            <chr> <chr>  <chr>    <chr>   
 1       209 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
 2       210 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
 3       211 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
 4       212 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
 5       213 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
 6       214 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
 7       215 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
 8       216 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
 9       217 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
10       218 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
# ℹ 128 more rows
# ℹ 6 more variables: cts_full_descriptor <chr>, type_of_issuer <chr>,
#   use_of_proceed <chr>, principal_currency <chr>, year <dbl>,
#   issuance_bn_usd <dbl>
green_debt_plot4 <- green_debt_plot3 |> 
filter(year == 2022 & unit == "Billion US Dollars")  
  
issuance_sum_2022 <- green_debt_plot4 |> 
  group_by(use_of_proceed) |> 
  summarize(total_issuance = sum(issuance_bn_usd, na.rm = TRUE)) |> 
  ungroup()

 
top_issuance_2022 <-issuance_sum_2022 |> 
   top_n(10, total_issuance)
# Plotting the top 10 uses of proceeds for 2022
ggplot(top_issuance_2022, aes(x = reorder(use_of_proceed, total_issuance),
                              y = total_issuance, fill = use_of_proceed)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_viridis_d() + 
  scale_y_continuous(labels = label_dollar(suffix = " bn"), expand = c(0, 0)) +
  labs(title = "Top 10 Green Bond Proceeds in 2022", x = "Use of Proceeds", y = "Total Issuance (USD)") +
  theme_minimal() +
  theme(legend.title = element_blank(), legend.position = "none")

industry_comparision_factoids <- top_issuance_2022 |> 
mutate(percentage = (total_issuance / sum(total_issuance))*100)
print(industry_comparision_factoids)
# A tibble: 10 × 3
   use_of_proceed                                      total_issuance percentage
   <chr>                                                        <dbl>      <dbl>
 1 Alternative Energy                                            25.7      1.18 
 2 Aquatic Biodiversity Conservation                             75.7      3.47 
 3 Circular Economy Adapted/Eco-efficient Products, P…           33.6      1.54 
 4 Clean Transport                                              833.      38.2  
 5 Climate Change Adaptation                                    277.      12.7  
 6 Eligible Green Projects                                      204.       9.35 
 7 Energy Efficiency                                            480.      22.0  
 8 Green Construction/Buildings                                 127.       5.83 
 9 Renewable Energy Projects                                    108.       4.97 
10 Sustainable Water or Wastewater management                    17.7      0.809

Factoids:

In 2022, the top 1 issuance of green bond in proceed is in clean transport energy, which is nearly two times bigger than the second bond in energy efficiency.

Problem3 Q2

library(tidyverse)
library(janitor)
green_debt_plot5 <- green_debt |> 
  clean_names() |> 
  filter(principal_currency != "Not Applicable") |> 
  
  pivot_longer(
    cols = matches("f\\d{4}"),
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = list(year = readr::parse_number),
    values_drop_na = TRUE
  ) 
print(green_debt_plot5)
# A tibble: 100 × 15
   object_id country iso2  iso3  indicator        unit  source cts_code cts_name
       <dbl> <chr>   <chr> <chr> <chr>            <chr> <chr>  <chr>    <chr>   
 1       109 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
 2       110 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
 3       111 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
 4       112 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
 5       113 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
 6       114 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
 7       115 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
 8       116 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
 9       117 World   <NA>  WLD   Cumulative Gree… Share Refin… ECFF     Green B…
10       118 World   <NA>  WLD   Cumulative Gree… Bill… Refin… ECFF     Green B…
# ℹ 90 more rows
# ℹ 6 more variables: cts_full_descriptor <chr>, type_of_issuer <chr>,
#   use_of_proceed <chr>, principal_currency <chr>, year <dbl>,
#   issuance_bn_usd <dbl>
library(tidyverse)

# Assuming green_debt_plot5 is your dataset and it has been prepared similarly to green_debt_plot2
currency_trends <- green_debt_plot5 %>% 
  group_by(year, principal_currency) %>% 
  summarise(total_issuance = sum(issuance_bn_usd, na.rm = TRUE), .groups = 'drop') %>%
  arrange(year, principal_currency)

# Assuming you want the top currencies across the entire dataset, not per year
top_currencies <- currency_trends %>%
  group_by(principal_currency) %>%
  summarise(total_issuance_overall = sum(total_issuance, na.rm = TRUE), .groups = 'drop') %>%
  top_n(10, total_issuance_overall) %>%
  pull(principal_currency)

# Filter the main dataset to include only top currencies
currency_trends_filtered <- currency_trends %>%
  filter(principal_currency %in% top_currencies)

# Visualization
currency_trends_filtered %>% 
  ggplot(aes(x = year, y = total_issuance, color = principal_currency, group = principal_currency)) +
  geom_line() +  # Assuming you want a line plot; use geom_col() for bars
  theme_minimal() +
  labs(title = "Green Bond Issuance by Currency Over Time",
       x = "Year",
       y = "Total Issuance Amount",
       color = "Currency")
`geom_line()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?