Data Transformation

options(repos = c(CRAN = "https://cran.rstudio.com/"))

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.4
✔ ggplot2   3.4.4     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

blackrock_etf_screener <- read_csv("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/ishares_etf_screener_as_of_2023-12-27.csv") 
Rows: 424 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): ticker, name, asset_class, sub_asset_class, region, market, locat...
dbl   (6): gross_expense_ratio_percent, net_expense_ratio_percent, net_asset...
dttm  (2): incept_date, net_assets_as_of

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
blackrock_etf_screener
# A tibble: 424 × 18
   ticker name                        incept_date         gross_expense_ratio_…¹
   <chr>  <chr>                       <dttm>                               <dbl>
 1 IVV    iShares Core S&P 500 ETF    2000-05-15 00:00:00                   0.03
 2 IEFA   iShares Core MSCI EAFE ETF  2012-10-18 00:00:00                   0.07
 3 AGG    iShares Core U.S. Aggregat… 2003-09-22 00:00:00                   0.03
 4 IWF    iShares Russell 1000 Growt… 2000-05-22 00:00:00                   0.19
 5 IJR    iShares Core S&P Small-Cap… 2000-05-22 00:00:00                   0.06
 6 IJH    iShares Core S&P Mid-Cap E… 2000-05-22 00:00:00                   0.05
 7 IEMG   iShares Core MSCI Emerging… 2012-10-18 00:00:00                   0.09
 8 IWM    iShares Russell 2000 ETF    2000-05-22 00:00:00                   0.19
 9 IWD    iShares Russell 1000 Value… 2000-05-22 00:00:00                   0.19
10 TLT    iShares 20+ Year Treasury … 2002-07-22 00:00:00                   0.15
# ℹ 414 more rows
# ℹ abbreviated name: ¹​gross_expense_ratio_percent
# ℹ 14 more variables: net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …
blackrock_etf_screener |> 
  distinct(sustainable_classification)
# A tibble: 5 × 1
  sustainable_classification
  <chr>                     
1 <NA>                      
2 Uplift                    
3 Thematic                  
4 Impact                    
5 Screened                  
blackrock_etf_screener_w_new_features <- blackrock_etf_screener |> 
  mutate(
    # if the sustainable_classification column is NA, then the fund is not an ESG fund.
    standard_or_esg = if_else(
      condition = is.na(sustainable_classification),
      true = "Standard",
      false = "ESG"
    ),
    # lubridate::year() extracts the year from a date
    inception_year = lubridate::year(incept_date),
    
    # Change to a meaningful magnitude for the data. In asset management, billions is a good default. 
    net_assets_bn_usd = net_assets_usd/10^9,
    
    # let's put our new variables at the front so we can see them easily.
    .before = everything()
  )

blackrock_etf_screener_w_new_features
# A tibble: 424 × 21
   standard_or_esg inception_year net_assets_bn_usd ticker name                 
   <chr>                    <dbl>             <dbl> <chr>  <chr>                
 1 Standard                  2000             399.  IVV    iShares Core S&P 500…
 2 Standard                  2012             107.  IEFA   iShares Core MSCI EA…
 3 Standard                  2003             101.  AGG    iShares Core U.S. Ag…
 4 Standard                  2000              82.1 IWF    iShares Russell 1000…
 5 Standard                  2000              78.2 IJR    iShares Core S&P Sma…
 6 Standard                  2000              77.1 IJH    iShares Core S&P Mid…
 7 Standard                  2012              73.9 IEMG   iShares Core MSCI Em…
 8 Standard                  2000              67.7 IWM    iShares Russell 2000…
 9 Standard                  2000              55.4 IWD    iShares Russell 1000…
10 Standard                  2002              52.2 TLT    iShares 20+ Year Tre…
# ℹ 414 more rows
# ℹ 16 more variables: incept_date <dttm>, gross_expense_ratio_percent <dbl>,
#   net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

You can ad

2.5.0.1 Homework problem 1:

Create a dplyr pipeline to create a tibble fulfills the following parameters:

  • only contains ESG funds (no standard funds)

  • contains the largest fund from each sustainable classification.

  • shows the percent of assets that the top fund comprises for its category (e.g. the top Thematic fund is x% of the total assets of all thematic funds).

  • Ranks each sustainable classification by the % of assets in its largest fund, and arranges them in descending order.

blackrock_etf_screener_w_new_features |>
  filter(standard_or_esg == "ESG") |>
  group_by(sustainable_classification) |>
  mutate(
    fund_rank = rank(net_assets_bn_usd * -1),
    fund_percentile = percent_rank(net_assets_bn_usd * -1),
    fund_decile = ntile(net_assets_bn_usd * -1, n = 10),
    pct_of_total = net_assets_bn_usd/sum(net_assets_bn_usd, na.rm = TRUE) * 100,
    pct_of_total_cumulative = cumsum(pct_of_total)
  ) |>
  filter(fund_rank == 1) |>
  arrange(desc(pct_of_total))
# A tibble: 4 × 26
# Groups:   sustainable_classification [4]
  standard_or_esg inception_year net_assets_bn_usd ticker name                  
  <chr>                    <dbl>             <dbl> <chr>  <chr>                 
1 ESG                       2018             0.335 BGRN   iShares USD Green Bon…
2 ESG                       2008             3.06  ICLN   iShares Global Clean …
3 ESG                       2020             0.182 XVV    iShares ESG Screened …
4 ESG                       2016            13.4   ESGU   iShares ESG Aware MSC…
# ℹ 21 more variables: incept_date <dttm>, gross_expense_ratio_percent <dbl>,
#   net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>,
#   msci_esg_percent_coverage <dbl>, sustainable_classification <chr>, …

2.5.0.2 Homework problem 2:

Create a sorted bar chart of the 10 smallest ETFs.

Hint: for sorting the bar chart, look up fct_reorder() . Make sure your chart has meaningful titles and labels, including numbers of an appropriate magnitude.

smallest10 <- blackrock_etf_screener_w_new_features |>
  arrange(net_assets_bn_usd) |>
  head(10)

ggplot(smallest10, aes(x = fct_reorder(ticker, net_assets_bn_usd), y = net_assets_bn_usd)) +
  geom_bar(width = 1, stat = "identity", fill = "purple") +
  labs(title = "Top 10 Smallest ETFs by Net Assets (Billion USD)",
       x = "ETF Ticker",
       y = "Net Assets (in Billion USD)") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

2.5.0.3 Homework problem 3:

  • Find the funds in the bottom quintile of the MSCI ESG quality score.

  • calculate the number of funds by total assets by asset class and sub asset class.

quantile <- quantile(blackrock_etf_screener_w_new_features$msci_esg_quality_score_0_10, probs = 0.25, na.rm = TRUE)

#Find the funds in the bottom quintile of the MSCI ESG quality score.
bottomQ <- blackrock_etf_screener_w_new_features |>
  arrange(desc(msci_esg_quality_score_0_10)) |>
  filter(msci_esg_quality_score_0_10 <= quantile)

#Calculate the number of funds by total assets by asset class
bottomQ |>
  group_by(asset_class)|>
  summarize(countInAssetClass = n())
# A tibble: 3 × 2
  asset_class  countInAssetClass
  <chr>                    <int>
1 Equity                      52
2 Fixed Income                48
3 Real Estate                  4
#Calculate the number of funds by total assets by sub asset class
bottomQ |>
  group_by(sub_asset_class)|>
  summarize(countInAssetClass = n())
# A tibble: 10 × 2
   sub_asset_class        countInAssetClass
   <chr>                              <int>
 1 All Cap                               23
 2 Corporates                             1
 3 Credit                                 2
 4 Government                            24
 5 High Yield                            17
 6 Inflation                              4
 7 Large Cap                              1
 8 Large/Mid Cap                         10
 9 Real Estate Securities                 4
10 Small Cap                             18

You can add options to executable code like this

2.5.0.4 Homework problem 4:

We only began exploring this data. Use your newfound dplyr skills to uncover actionable insights from the dataset.

  • Show your work.

  • Present 5 repeatable factoids or data visualizations. As always, make sure your data visualizations have meaningful titles and labels. We will share these in class.

  1. Pie Chart

    Visualize the market distribution for the funds of the bottom quantile of MSCI ESG quality score

pie_data <- bottomQ %>%
  group_by(market) %>%
  summarize(total_weight = n())

pie_chart <- ggplot(pie_data, aes(x = "", y = total_weight, fill = market)) +
  geom_bar(width = 0.5, stat = "identity") +
  coord_polar("y", start = 0) + 
  labs(title = "Market distribution for the bottom quantile of the MSCI ESG quality score", fill = "market") + 
  geom_text(aes(label = paste0(round(total_weight, 1), "%")), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start = 0) + 
  
  theme_void()
Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
# Display the pie chart
print(pie_chart)

  1. Histogram

    Present the number of standard funds and the number of ESG funds by their inception year to show the historical trend

ggplot(blackrock_etf_screener_w_new_features, aes(x = inception_year, fill = factor(standard_or_esg))) +
  geom_histogram(bins = 25) +
  labs(x = "Year", y = "Number of Funds", title = "Distribution of Funds") + 
  scale_fill_discrete(name = "Standard or ESG")+
  theme_minimal()

  1. Boxplot

    Average net assest sizes across different sustainable classification

avgNA_bn <- blackrock_etf_screener_w_new_features %>%
  group_by(sustainable_classification, inception_year) %>%
  summarize(mean_avgNA = mean(net_assets_bn_usd, na.rm = T))
`summarise()` has grouped output by 'sustainable_classification'. You can
override using the `.groups` argument.
ggplot(avgNA_bn, aes(x = sustainable_classification, y = mean_avgNA)) +
  geom_boxplot(fill = "#B23333") +
  geom_jitter() +
  labs(x = "Sustainable Classfication", y = "Mean Net Asset Value", title = "Average Asset Value")+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

  1. Viridis Color Maps
library(viridis)
Loading required package: viridisLite
Num_funds <- blackrock_etf_screener_w_new_features %>%
  group_by(region) %>%
  summarize(num_funds = n(), na.rm = T)


ggplot(data = Num_funds ,aes(x=reorder(region,num_funds),y=num_funds,fill=region))+
  geom_bar(width = 2,stat = "identity")+
  coord_polar(theta = "x",start=0)+
  scale_fill_viridis(option = "D",discrete = T)+
  theme_minimal()+xlab(" ")+ylab(" ")+
  labs(title = "Regional Distribution of Funds")+
  theme(legend.position = "none")+
  scale_fill_viridis(option = "D",discrete = T)+
  theme_minimal()+xlab(" ")+ylab(" ")
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
Warning: `position_stack()` requires non-overlapping x intervals

  1. Violin Plot
library(ggthemes)
library(ggsci)
library(gghalves)

blackrock_dropna <- na.omit(blackrock_etf_screener_w_new_features, cols = "Region")


ggplot(blackrock_dropna, aes(x = region, y = net_assets_bn_usd, fill = region)) +
  theme_economist() +
  labs(title = "Funds Net Value by Region",
       y = "Net Assets Value (Billion USD)",
       x = " "
  ) +
  scale_fill_nejm() +
  geom_half_violin(side = "r")