Data Transformation Practice

Author

Pinandito Wisambudi

Preparation

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(ggplot2)

options(scipen=999)

blackrock_etf_screener <- read_csv("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/ishares_etf_screener_as_of_2023-12-27.csv")

Rows: 424 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): ticker, name, asset_class, sub_asset_class, region, market, locat...
dbl   (6): gross_expense_ratio_percent, net_expense_ratio_percent, net_asset...
dttm  (2): incept_date, net_assets_as_of

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

blackrock_etf_screener

# A tibble: 424 × 18
   ticker name                        incept_date         gross_expense_ratio_…¹
   <chr>  <chr>                       <dttm>                               <dbl>
 1 IVV    iShares Core S&P 500 ETF    2000-05-15 00:00:00                   0.03
 2 IEFA   iShares Core MSCI EAFE ETF  2012-10-18 00:00:00                   0.07
 3 AGG    iShares Core U.S. Aggregat… 2003-09-22 00:00:00                   0.03
 4 IWF    iShares Russell 1000 Growt… 2000-05-22 00:00:00                   0.19
 5 IJR    iShares Core S&P Small-Cap… 2000-05-22 00:00:00                   0.06
 6 IJH    iShares Core S&P Mid-Cap E… 2000-05-22 00:00:00                   0.05
 7 IEMG   iShares Core MSCI Emerging… 2012-10-18 00:00:00                   0.09
 8 IWM    iShares Russell 2000 ETF    2000-05-22 00:00:00                   0.19
 9 IWD    iShares Russell 1000 Value… 2000-05-22 00:00:00                   0.19
10 TLT    iShares 20+ Year Treasury … 2002-07-22 00:00:00                   0.15
# ℹ 414 more rows
# ℹ abbreviated name: ¹gross_expense_ratio_percent
# ℹ 14 more variables: net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

blackrock_etf_screener |> 
  group_by(sustainable_classification) |> 
  summarize(
    # how many funds?
    n_funds = n(),
    # how much money?
    assets_usd = sum(net_assets_usd, na.rm = TRUE)
    ) |> 
  # arrange in descending order
  arrange(assets_usd |> desc())

# A tibble: 5 × 3
  sustainable_classification n_funds assets_usd
  <chr>                        <int>      <dbl>
1 <NA>                           384    2.55e12
2 Uplift                          30    5.14e10
3 Thematic                         6    3.43e 9
4 Screened                         3    3.83e 8
5 Impact                           1    3.35e 8

blackrock_etf_screener_w_new_features <- blackrock_etf_screener |> 
  mutate(
    # if the sustainable_classification column is NA, then the fund is not an ESG fund.
    standard_or_esg = if_else(
      condition = is.na(sustainable_classification),
      true = "Standard",
      false = "ESG"
    ),
    # lubridate::year() extracts the year from a date
    inception_year = lubridate::year(incept_date),
    
    # Change to a meaningful magnitude for the data. In asset management, billions is a good default. 
    net_assets_bn_usd = net_assets_usd/10^9,
    
    # let's put our new variables at the front so we can see them easily.
    .before = everything()
  )

blackrock_etf_screener_w_new_features

# A tibble: 424 × 21
   standard_or_esg inception_year net_assets_bn_usd ticker name                 
   <chr>                    <dbl>             <dbl> <chr>  <chr>                
 1 Standard                  2000             399.  IVV    iShares Core S&P 500…
 2 Standard                  2012             107.  IEFA   iShares Core MSCI EA…
 3 Standard                  2003             101.  AGG    iShares Core U.S. Ag…
 4 Standard                  2000              82.1 IWF    iShares Russell 1000…
 5 Standard                  2000              78.2 IJR    iShares Core S&P Sma…
 6 Standard                  2000              77.1 IJH    iShares Core S&P Mid…
 7 Standard                  2012              73.9 IEMG   iShares Core MSCI Em…
 8 Standard                  2000              67.7 IWM    iShares Russell 2000…
 9 Standard                  2000              55.4 IWD    iShares Russell 1000…
10 Standard                  2002              52.2 TLT    iShares 20+ Year Tre…
# ℹ 414 more rows
# ℹ 16 more variables: incept_date <dttm>, gross_expense_ratio_percent <dbl>,
#   net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

Homework Problem 1

Create a dplyr pipeline to create a tibble fulfills the following parameters:

only contains ESG funds (no standard funds)
contains the largest fund from each sustainable classification.
shows the percent of assets that the top fund comprises for its category (e.g. the top Thematic fund is x% of the total assets of all thematic funds).
Ranks each sustainable classification by the % of assets in its largest fund, and arranges them in descending order.

largest_funds <- blackrock_etf_screener_w_new_features %>%
  
  #Filter ESG funds
  filter(standard_or_esg == "ESG") %>%
  select(ticker, name, net_assets_bn_usd, sustainable_classification) %>%
  
  #Group by sustainable classification and calculate stats
  group_by(sustainable_classification) %>%
  mutate(
    fund_rank = rank(net_assets_bn_usd * -1),
    fund_percentile = percent_rank(net_assets_bn_usd),
    fund_decile = ntile(net_assets_bn_usd, n = 10),
    pct_of_total = net_assets_bn_usd/sum(net_assets_bn_usd, na.rm = TRUE) * 100,
    pct_of_total_cumulative = cumsum(pct_of_total)
  ) %>%
  
  #Filter top-ranking funds; rank these funds by concentration, format for presentation
  filter(fund_rank == 1) %>%
  ungroup() %>%
  mutate(
    fund_concentration_rank = rank(pct_of_total * -1)
  ) %>%
  select(fund_concentration_rank,ticker, name, net_assets_bn_usd, sustainable_classification,pct_of_total) %>%
  arrange(fund_concentration_rank)

print(largest_funds)

# A tibble: 4 × 6
  fund_concentration_rank ticker name   net_assets_bn_usd sustainable_classifi…¹
                    <dbl> <chr>  <chr>              <dbl> <chr>                 
1                       1 BGRN   iShar…             0.335 Impact                
2                       2 ICLN   iShar…             3.06  Thematic              
3                       3 XVV    iShar…             0.182 Screened              
4                       4 ESGU   iShar…            13.4   Uplift                
# ℹ abbreviated name: ¹sustainable_classification
# ℹ 1 more variable: pct_of_total <dbl>

Homework Problem 2

Create a sorted bar chart of the 10 smallest ETFs.

Hint: for sorting the bar chart, look up fct_reorder() . Make sure your chart has meaningful titles and labels, including numbers of an appropriate magnitude.

smallest_etfs <- blackrock_etf_screener_w_new_features %>%
  filter(standard_or_esg == "ESG") %>%
  select(ticker, name, net_assets_bn_usd) %>%
  arrange(net_assets_bn_usd) %>%
  head(10)

bar_chart_smallest_etfs <- ggplot(smallest_etfs,
       aes(x=net_assets_bn_usd, y=fct_reorder(name,net_assets_bn_usd)))+
  geom_bar(stat="identity",fill="navy", width=0.5)+
  labs(title="10 Smallest ETFs by Net Assets",
       x="Net Assets (in billion USD)",
       y="ETF Name")+
  theme_classic()
  
print(bar_chart_smallest_etfs)

Homework Problem 3

Find the funds in the bottom quintile of the MSCI ESG quality score.
For the funds in the bottom quintile of MSCI ESG quality scores, calculate i) the number of funds, and ii) the total assets, grouped by asset class (e.g. Equity, Fixed Income) and sub-asset class (e.g. Emerging Markets).

bottom_quantile <- blackrock_etf_screener_w_new_features %>%
  filter(!is.na(msci_esg_quality_score_0_10)) %>%
  mutate(quantile_group = ntile(msci_esg_quality_score_0_10,5)) %>%
  filter(quantile_group==1) %>%
  arrange(desc(msci_esg_quality_score_0_10))

bottom_quantile_group <- bottom_quantile %>%
  group_by(asset_class, market) %>%
  summarise(
    number_of_funds=n(),
    total_assets = sum(net_assets_bn_usd)
  )

`summarise()` has grouped output by 'asset_class'. You can override using the
`.groups` argument.

print(bottom_quantile)

# A tibble: 76 × 22
   standard_or_esg inception_year net_assets_bn_usd ticker name                 
   <chr>                    <dbl>             <dbl> <chr>  <chr>                
 1 Standard                  2022           0.00459 BPAY   BlackRock Future Fin…
 2 Standard                  2003          17.6     EEM    iShares MSCI Emergin…
 3 Standard                  2014           0.129   HEEM   iShares Currency Hed…
 4 Standard                  2014           3.48    REET   iShares Global REIT …
 5 Standard                  2022           0.00400 IBRN   iShares Neuroscience…
 6 Standard                  2012           0.128   GHYG   iShares US & Intl Hi…
 7 Standard                  2000           8.10    IJS    iShares S&P Small-Ca…
 8 Standard                  2023           0.0130  IBHJ   iShares® iBonds® 203…
 9 Standard                  2015           0.554   EMGF   iShares Emerging Mar…
10 Standard                  2022           0.0162  IBHI   iShares® iBonds® 202…
# ℹ 66 more rows
# ℹ 17 more variables: incept_date <dttm>, gross_expense_ratio_percent <dbl>,
#   net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

print(bottom_quantile_group)

# A tibble: 5 × 4
# Groups:   asset_class [3]
  asset_class  market    number_of_funds total_assets
  <chr>        <chr>               <int>        <dbl>
1 Equity       Developed              22       203.  
2 Equity       Emerging               29       124.  
3 Fixed Income Developed              16        38.6 
4 Fixed Income Emerging                5        17.7 
5 Real Estate  Developed               4         7.15

Homework Problem 4

We only began exploring this data. Use your newfound dplyr skills to uncover actionable insights from the dataset.

Show your work.
Present 5 repeatable factoids or data visualizations. As always, make sure your data visualizations have meaningful titles and labels. We will share these in class.

hhh

rating_trends <- blackrock_etf_screener_w_new_features %>%
  group_by(msci_esg_fund_rating_aaa_ccc, incept_date) %>%
  summarize(count = n()) %>%
  arrange(incept_date)

`summarise()` has grouped output by 'msci_esg_fund_rating_aaa_ccc'. You can
override using the `.groups` argument.

ggplot(rating_trends, aes(x = incept_date, y = count, color = msci_esg_fund_rating_aaa_ccc)) +
  geom_line(linewidth=0.5) +
  geom_point(size = 1) +
  labs(title = "MSCI ESG Fund Rating Trends",
       x = "Time Period",
       y = "Count",
       color = "MSCI ESG Rating") +
  theme_minimal()

ESG Fund by Asset Class (Table)

esg_by_asset_class <- blackrock_etf_screener_w_new_features %>%
  filter(standard_or_esg == "ESG") %>%
  group_by(asset_class) %>%
  summarize(net_assets_bn_usd = sum(net_assets_bn_usd, na.rm = TRUE)) %>%
  mutate(percentage = round((net_assets_bn_usd / sum(net_assets_bn_usd)) * 100, 1)) %>%
  arrange(desc(net_assets_bn_usd))

print(esg_by_asset_class)

# A tibble: 4 × 3
  asset_class  net_assets_bn_usd percentage
  <chr>                    <dbl>      <dbl>
1 Equity                48.8           87.9
2 Fixed Income           6.68          12  
3 Multi Asset            0.0574         0.1
4 Real Estate            0.00939        0

ESG Fund by Asset Class (Pie Chart)

esg_by_asset_class <- blackrock_etf_screener_w_new_features %>%
  filter(standard_or_esg == "ESG") %>%
  group_by(asset_class) %>%
  summarize(net_assets_bn_usd = sum(net_assets_bn_usd, na.rm = TRUE)) %>%
  mutate(percentage = round((net_assets_bn_usd / sum(net_assets_bn_usd)) * 100, 1)) %>%
  arrange(desc(net_assets_bn_usd))

pie_chart <- ggplot(esg_by_asset_class, aes(x="", y=net_assets_bn_usd, fill=asset_class)) +
  geom_col() + 
  coord_polar("y", start=0) +
  labs(title="ESG Fund by Asset Class",
       subtitle="The majority of the ESG funds are in Equity",
       fill="Asset Class") +
  geom_text(aes(label = paste0(percentage, "%")), position = position_stack(vjust = 0.5)) +
  theme_minimal()

pie_chart

Net Assets by Region (Bar Chart)

net_assets_by_region <- blackrock_etf_screener_w_new_features %>%
  group_by(region) %>%
  summarize(net_assets_usd = sum(net_assets_usd, na.rm = TRUE)/1000000000) %>%
  arrange(desc(net_assets_usd))  # Arrange by net assets

ggplot(net_assets_by_region, aes(x = reorder(region, -net_assets_usd), y = net_assets_usd)) +
  geom_bar(stat = "identity", fill = "pink", color = "black") +
  labs(
    x = "Region", 
    y = "Net Assets (billion USD)", 
    title = "Net Assets by Region",
    subtitle = "North America ") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Carbon Intensity vs Net Assets (Scatter Plot)

library(dplyr)

# Filter out rows with missing values in MSCI weighted average carbon intensity or net assets
filtered_funds <- blackrock_etf_screener_w_new_features %>%
  filter(!is.na(msci_weighted_average_carbon_intensity_tons_co2e_m_sales) & !is.na(net_assets_bn_usd)) %>%
  filter(msci_weighted_average_carbon_intensity_tons_co2e_m_sales <= 1000,net_assets_bn_usd <= 200)

# Create a new column to categorize ESG and non-ESG funds
filtered_funds <- filtered_funds %>%
  mutate(standard_or_esg = ifelse(standard_or_esg == "ESG", "ESG", "Standard"))

# Create a scatter plot
ggplot(filtered_funds, aes(x = (net_assets_bn_usd), y =(msci_weighted_average_carbon_intensity_tons_co2e_m_sales), color = standard_or_esg)) +
  geom_point() +
  geom_smooth(method="lm", color="red")+
  scale_color_manual(values = c("ESG" = "darkgreen", "Standard" = "orange")) +
  labs(
    title = "Correlation between Carbon Intensity and Net Assets",
    subtitle = "Negative correlation: Funds with higher carbon intensity tend to have lower net assets",
    x = "Net Assets (USD)", 
    y = "MSCI Weighted Average Carbon Intensity (tCO2e/$M sales)", 
    color = "Fund Type") +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

The regression line in this scatter plot shows a negative slope, which means there is a negative correlation between Net Assets and MSCI Weighted Average Carbon Intensity. As the value of the Net Assets increases, the carbon intensity tend to decreases. The outcome could relate to investor’s preference to choose lower carbon intensity assets.