HW Data Tidying

HW Data Tidying

library(tidyverse) # because, always
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor) # for clean_names() - makes variable names snake_case

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(countrycode)

read data set:

imf_climate_dashboards_green_debt_url <- 
  "https://opendata.arcgis.com/datasets/8e2772e0b65f4e33a80183ce9583d062_0.csv"
green_debt <- imf_climate_dashboards_green_debt_url |> 
  read_csv() 
Rows: 355 Columns: 42
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): Country, ISO2, ISO3, Indicator, Unit, Source, CTS_Code, CTS_Name, ...
dbl (30): ObjectId, F1985, F1986, F1987, F1990, F1991, F1992, F1993, F1994, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Problem 1:

indicators_we_want <- c("Green Bond Issuances by Country", "Sovereign Green Bond Issuances")

green_debt_subset <- green_debt |> 
  clean_names() |> 
  filter(indicator %in% indicators_we_want) |> 
  mutate(region=countrycode(country, origin="country.name", 
                            destination="continent")) |>
  select(country, iso3, region, indicator, matches("f\\d{4}")) 
green_debt_subset 
# A tibble: 107 × 33
   country      iso3  region indicator f1985 f1986 f1987 f1990 f1991 f1992 f1993
   <chr>        <chr> <chr>  <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 Argentina    ARG   Ameri… Green Bo…    NA    NA    NA    NA    NA    NA    NA
 2 Australia    AUS   Ocean… Green Bo…    NA    NA    NA    NA    NA    NA    NA
 3 Austria      AUT   Europe Green Bo…    NA    NA    NA    NA    NA    NA    NA
 4 Austria      AUT   Europe Sovereig…    NA    NA    NA    NA    NA    NA    NA
 5 Bangladesh   BGD   Asia   Green Bo…    NA    NA    NA    NA    NA    NA    NA
 6 Belarus, Re… BLR   Europe Green Bo…    NA    NA    NA    NA    NA    NA    NA
 7 Belarus, Re… BLR   Europe Sovereig…    NA    NA    NA    NA    NA    NA    NA
 8 Belgium      BEL   Europe Green Bo…    NA    NA    NA    NA    NA    NA    NA
 9 Belgium      BEL   Europe Sovereig…    NA    NA    NA    NA    NA    NA    NA
10 Bermuda      BMU   Ameri… Green Bo…    NA    NA    NA    NA    NA    NA    NA
# ℹ 97 more rows
# ℹ 22 more variables: f1994 <dbl>, f1999 <dbl>, f2000 <dbl>, f2002 <dbl>,
#   f2003 <dbl>, f2004 <dbl>, f2007 <dbl>, f2008 <dbl>, f2009 <dbl>,
#   f2010 <dbl>, f2011 <dbl>, f2012 <dbl>, f2013 <dbl>, f2014 <dbl>,
#   f2015 <dbl>, f2016 <dbl>, f2017 <dbl>, f2018 <dbl>, f2019 <dbl>,
#   f2020 <dbl>, f2021 <dbl>, f2022 <dbl>
green_bonds_region <- green_debt_subset |> 
  pivot_longer(
    cols = matches("f\\d{4}"), 
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = readr::parse_number,
    values_drop_na = TRUE
    ) |>
  group_by(region) |>
  mutate(cumulative_bn_usd_region=cumsum(issuance_bn_usd)) |>
  select(region, cumulative_bn_usd_region)
green_bonds_region
# A tibble: 465 × 2
# Groups:   region [5]
   region   cumulative_bn_usd_region
   <chr>                       <dbl>
 1 Americas                    0.974
 2 Americas                    1.02 
 3 Americas                    1.94 
 4 Americas                    2.15 
 5 Oceania                     0.526
 6 Oceania                     0.938
 7 Oceania                     1.47 
 8 Oceania                     4.00 
 9 Oceania                     6.22 
10 Oceania                     8.21 
# ℹ 455 more rows
biggest_green_bond_region <- green_bonds_region |> 
  group_by(region) |> 
  slice_max(order_by = cumulative_bn_usd_region) |> 
  arrange(desc(cumulative_bn_usd_region)) |> 
  select(region, cumulative_bn_usd_region) |> 
  ungroup()
biggest_green_bond_region
# A tibble: 5 × 2
  region   cumulative_bn_usd_region
  <chr>                       <dbl>
1 Europe                     1391. 
2 Asia                        582. 
3 Americas                    339. 
4 Oceania                      30.4
5 Africa                       19.2
green_bd_chart <- biggest_green_bond_region |> 
  ggplot(aes(x = cumulative_bn_usd_region, 
             # order regions by cumulative issuance
             y = fct_reorder(.f = region, .x = cumulative_bn_usd_region)
             )) +
  geom_col(fill = "forestgreen") +
  theme_minimal() +
  scale_x_continuous(labels = scales::label_dollar(suffix = " bn"),
                     expand = c(0,0)) +
  labs(title = "Cumulative Issuance Green Bond by Region",
       subtitle = "Europian countries do issue a whole lot of green bonds",
       x = "Cumulative Issuance (USD)",
       y = "",
       caption = "Data: IMF Climate Change Dashboard | Insight: Me!")
green_bd_chart

Problem 2:

green_debt2 <- green_debt |>
  clean_names() |>
  filter(type_of_issuer!="Not Applicable")


# Count the number of issuers for each year
issuer_by_year <- green_debt2 |>
  pivot_longer(
    cols = matches("f\\d{4}"), 
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = readr::parse_number,
    values_drop_na = TRUE) |>
  group_by(year) |>
  summarise(n=n())
issuer_by_year
# A tibble: 29 × 2
    year     n
   <dbl> <int>
 1  1985     2
 2  1986     1
 3  1987     1
 4  1990     1
 5  1991     1
 6  1992     1
 7  1993     2
 8  1994     1
 9  1999     1
10  2000     2
# ℹ 19 more rows
ggplot(issuer_by_year, aes(x = year, y = n)) +
  geom_line(color = "skyblue", size = 1) +
  geom_point(color = "darkblue", size = 2) +
  labs(title = "Green Bond Issuers Over Time",
       x = "Year",
       y = "Number of Issuers") +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

#total size of bond issuance by year
green_bond3 <- green_debt2 |>
  pivot_longer(
    cols = matches("f\\d{4}"), 
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = readr::parse_number,
    values_drop_na = TRUE)

total_amount_issuance <- green_bond3 |>
  group_by(year) |>
  summarise(TotalIssuance = sum(issuance_bn_usd, na.rm = TRUE)) |>
  arrange(desc(TotalIssuance))
total_amount_issuance
# A tibble: 29 × 2
    year TotalIssuance
   <dbl>         <dbl>
 1  2021         653. 
 2  2022         541. 
 3  2020         302. 
 4  2019         284. 
 5  2018         164. 
 6  2017         139. 
 7  2016          99.6
 8  2015          47.6
 9  2014          30.8
10  2013          12.4
# ℹ 19 more rows
#percentage of bond issued in 2022 x times greater than 2021
factoid_2022 <- total_amount_issuance |>
  filter(year %in% c(2021, 2022)) |>
  mutate(PercentageIncrease = (TotalIssuance / lag(TotalIssuance)) * 100)
factoid_2022
# A tibble: 2 × 3
   year TotalIssuance PercentageIncrease
  <dbl>         <dbl>              <dbl>
1  2021          653.               NA  
2  2022          541.               82.8
#percentage of bond issued in 2022 x times greater than 2021
factoid_2021 <- total_amount_issuance |>
  filter(year %in% c(2020, 2021)) |>
  mutate(PercentageIncrease = (TotalIssuance / lag(TotalIssuance)) * 100)
factoid_2021
# A tibble: 2 × 3
   year TotalIssuance PercentageIncrease
  <dbl>         <dbl>              <dbl>
1  2021          653.               NA  
2  2020          302.               46.3

Problem 3:

# Count the number of issuers for each use of proceeds
issuer_by_use <- green_debt2 |>
  pivot_longer(
    cols = matches("f\\d{4}"), 
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = readr::parse_number,
    values_drop_na = TRUE) |>
  group_by(use_of_proceed) |>
  summarise(n=n())
issuer_by_use
# A tibble: 1 × 2
  use_of_proceed     n
  <chr>          <int>
1 Not Applicable    98
#Count the number of issuance in each principal currency
issuer_by_currency <- green_debt2 |>
  pivot_longer(
    cols = matches("f\\d{4}"), 
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = readr::parse_number,
    values_drop_na = TRUE) |>
  group_by(principal_currency) |>
  summarise(n=n())
issuer_by_currency
# A tibble: 1 × 2
  principal_currency     n
  <chr>              <int>
1 Not Applicable        98
issuer_by_currency_trend <- green_debt2 |>
  pivot_longer(
    cols = matches("f\\d{4}"), 
    names_to = "year",
    values_to = "issuance_bn_usd",
    names_transform = readr::parse_number,
    values_drop_na = TRUE) |>
  group_by(year, principal_currency) |>
  summarise(n=n())
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
issuer_by_currency_trend
# A tibble: 29 × 3
# Groups:   year [29]
    year principal_currency     n
   <dbl> <chr>              <int>
 1  1985 Not Applicable         2
 2  1986 Not Applicable         1
 3  1987 Not Applicable         1
 4  1990 Not Applicable         1
 5  1991 Not Applicable         1
 6  1992 Not Applicable         1
 7  1993 Not Applicable         2
 8  1994 Not Applicable         1
 9  1999 Not Applicable         1
10  2000 Not Applicable         2
# ℹ 19 more rows
number_each_year <- c(issuer_by_year$n)

ggplot(issuer_by_currency_trend, aes(x = year, y = n/number_each_year*100, 
                                     group=principal_currency,
                                     color=group)) +
  geom_line(color = "skyblue", size = 1) +
  geom_point(color = "darkblue", size = 2) +
  labs(title = "Currency of Issuance Over Time",
       x = "Year",
       y = "percentage of Issuers") +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  theme_minimal()