Data Transformation

#load the tidyverse 
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# this will stop R from presenting data in scientific notation, which can be annoying. 
options(scipen = 999)

#load data
blackrock_etf_screener <- read_csv("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/ishares_etf_screener_as_of_2023-12-27.csv")

## Rows: 424 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): ticker, name, asset_class, sub_asset_class, region, market, locat...
## dbl   (6): gross_expense_ratio_percent, net_expense_ratio_percent, net_asset...
## dttm  (2): incept_date, net_assets_as_of
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

blackrock_etf_screener

## # A tibble: 424 × 18
##    ticker name                        incept_date         gross_expense_ratio_…¹
##    <chr>  <chr>                       <dttm>                               <dbl>
##  1 IVV    iShares Core S&P 500 ETF    2000-05-15 00:00:00                   0.03
##  2 IEFA   iShares Core MSCI EAFE ETF  2012-10-18 00:00:00                   0.07
##  3 AGG    iShares Core U.S. Aggregat… 2003-09-22 00:00:00                   0.03
##  4 IWF    iShares Russell 1000 Growt… 2000-05-22 00:00:00                   0.19
##  5 IJR    iShares Core S&P Small-Cap… 2000-05-22 00:00:00                   0.06
##  6 IJH    iShares Core S&P Mid-Cap E… 2000-05-22 00:00:00                   0.05
##  7 IEMG   iShares Core MSCI Emerging… 2012-10-18 00:00:00                   0.09
##  8 IWM    iShares Russell 2000 ETF    2000-05-22 00:00:00                   0.19
##  9 IWD    iShares Russell 1000 Value… 2000-05-22 00:00:00                   0.19
## 10 TLT    iShares 20+ Year Treasury … 2002-07-22 00:00:00                   0.15
## # ℹ 414 more rows
## # ℹ abbreviated name: ¹gross_expense_ratio_percent
## # ℹ 14 more variables: net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
## #   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
## #   region <chr>, market <chr>, location <chr>, investment_style <chr>,
## #   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
## #   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

blackrock_etf_screener |> 
  group_by(sustainable_classification) |> 
  summarize(
    # how many funds?
    n_funds = n(),
    # how much money?
    assets_usd = sum(net_assets_usd, na.rm = TRUE)
    ) |> 
  # arrange in descending order
  arrange(assets_usd |> desc())

## # A tibble: 5 × 3
##   sustainable_classification n_funds assets_usd
##   <chr>                        <int>      <dbl>
## 1 <NA>                           384    2.55e12
## 2 Uplift                          30    5.14e10
## 3 Thematic                         6    3.43e 9
## 4 Screened                         3    3.83e 8
## 5 Impact                           1    3.35e 8

blackrock_etf_screener_w_new_features <- blackrock_etf_screener |> 
  mutate(
    # if the sustainable_classification column is NA, then the fund is not an ESG fund.
    standard_or_esg = if_else(
      condition = is.na(sustainable_classification),
      true = "Standard",
      false = "ESG"
    ),
    # lubridate::year() extracts the year from a date
    inception_year = lubridate::year(incept_date),
    
    # Change to a meaningful magnitude for the data. In asset management, billions is a good default. 
    net_assets_bn_usd = net_assets_usd/10^9,
    
    # let's put our new variables at the front so we can see them easily.
    .before = everything()
  )

blackrock_etf_screener_w_new_features

## # A tibble: 424 × 21
##    standard_or_esg inception_year net_assets_bn_usd ticker name                 
##    <chr>                    <dbl>             <dbl> <chr>  <chr>                
##  1 Standard                  2000             399.  IVV    iShares Core S&P 500…
##  2 Standard                  2012             107.  IEFA   iShares Core MSCI EA…
##  3 Standard                  2003             101.  AGG    iShares Core U.S. Ag…
##  4 Standard                  2000              82.1 IWF    iShares Russell 1000…
##  5 Standard                  2000              78.2 IJR    iShares Core S&P Sma…
##  6 Standard                  2000              77.1 IJH    iShares Core S&P Mid…
##  7 Standard                  2012              73.9 IEMG   iShares Core MSCI Em…
##  8 Standard                  2000              67.7 IWM    iShares Russell 2000…
##  9 Standard                  2000              55.4 IWD    iShares Russell 1000…
## 10 Standard                  2002              52.2 TLT    iShares 20+ Year Tre…
## # ℹ 414 more rows
## # ℹ 16 more variables: incept_date <dttm>, gross_expense_ratio_percent <dbl>,
## #   net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
## #   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
## #   region <chr>, market <chr>, location <chr>, investment_style <chr>,
## #   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
## #   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

#Practice Problem 1
top_funds <- blackrock_etf_screener_w_new_features %>%
  
  #keep ESG funds
  filter(standard_or_esg == "ESG") %>%
  select(ticker, name, net_assets_bn_usd, sustainable_classification) %>%
  #group by sustainable classification and calculate stats
  group_by(sustainable_classification) %>%
  mutate(
    fund_rank = rank(net_assets_bn_usd * -1),
    fund_percentile = percent_rank(net_assets_bn_usd),
    fund_decile = ntile(net_assets_bn_usd, n = 10),
    pct_of_total = net_assets_bn_usd/sum(net_assets_bn_usd, na.rm = TRUE) * 100,
    pct_of_total_cumulative = cumsum(pct_of_total)
  ) %>%
  #keep top-ranking funds; rank these funds by concentration, format for presentation
  filter(fund_rank == 1) %>%
  ungroup() %>%
  mutate(
    fund_concentration_rank = rank(pct_of_total * -1)
  ) %>%
  select(fund_concentration_rank,ticker, name, net_assets_bn_usd, sustainable_classification,pct_of_total) %>%
  arrange(fund_concentration_rank)

print(top_funds)

## # A tibble: 4 × 6
##   fund_concentration_rank ticker name   net_assets_bn_usd sustainable_classifi…¹
##                     <dbl> <chr>  <chr>              <dbl> <chr>                 
## 1                       1 BGRN   iShar…             0.335 Impact                
## 2                       2 ICLN   iShar…             3.06  Thematic              
## 3                       3 XVV    iShar…             0.182 Screened              
## 4                       4 ESGU   iShar…            13.4   Uplift                
## # ℹ abbreviated name: ¹sustainable_classification
## # ℹ 1 more variable: pct_of_total <dbl>

#Practice Problem 2

# Filter for ESG funds and select relevant columns
smallest_etfs <- blackrock_etf_screener_w_new_features %>%
  filter(standard_or_esg == "ESG") %>%
  select(ticker, name, net_assets_bn_usd) %>%
  # Sort by net assets in ascending order and take the top 10 smallest ETFs
  arrange(net_assets_bn_usd) %>%
  head(10)

# Create a sorted bar chart
ggplot(smallest_etfs, aes(x = fct_reorder(name, net_assets_bn_usd), y = net_assets_bn_usd)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # Horizontal bars
  labs(title = "Top 10 Smallest ETFs",
       x = "ETF Name",
       y = "Net Assets (in billion USD)",
       caption = "Source: Your Data Source") +
  theme_minimal()

#Practice Problem 3

# Step 1: Find the bottom quintile of MSCI ESG quality score
bottom_quintile <- blackrock_etf_screener_w_new_features %>%
  filter(!is.na(msci_esg_quality_score_0_10)) %>%
  mutate(quantile_group = ntile(msci_esg_quality_score_0_10, 5)) %>%
  filter(quantile_group == 1)  # Selecting the bottom quintile

# Step 2: Group by asset class and sub-asset class, calculate number of funds and total assets
bottom_quintile_summary <- bottom_quintile %>%
  group_by(asset_class, sub_asset_class) %>%
  summarise(
    num_funds = n(),  # Number of funds
    total_assets = sum(net_assets_bn_usd)  # Total assets
  )

## `summarise()` has grouped output by 'asset_class'. You can override using the
## `.groups` argument.

# View the summary
print(bottom_quintile)

## # A tibble: 76 × 22
##    standard_or_esg inception_year net_assets_bn_usd ticker name                 
##    <chr>                    <dbl>             <dbl> <chr>  <chr>                
##  1 Standard                  2000              78.2 IJR    iShares Core S&P Sma…
##  2 Standard                  2012              73.9 IEMG   iShares Core MSCI Em…
##  3 Standard                  2000              67.7 IWM    iShares Russell 2000…
##  4 Standard                  2007              18.8 HYG    iShares iBoxx $ High…
##  5 Standard                  2003              17.6 EEM    iShares MSCI Emergin…
##  6 Standard                  2007              16.4 EMB    iShares J.P. Morgan …
##  7 Standard                  2007              13.7 PFF    iShares Preferred an…
##  8 Standard                  2000              12.6 IWN    iShares Russell 2000…
##  9 Standard                  2017              12.0 USHY   iShares Broad USD Hi…
## 10 Standard                  2000              11.1 IWO    iShares Russell 2000…
## # ℹ 66 more rows
## # ℹ 17 more variables: incept_date <dttm>, gross_expense_ratio_percent <dbl>,
## #   net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
## #   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
## #   region <chr>, market <chr>, location <chr>, investment_style <chr>,
## #   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
## #   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

print(bottom_quintile_summary)

## # A tibble: 9 × 4
## # Groups:   asset_class [3]
##   asset_class  sub_asset_class        num_funds total_assets
##   <chr>        <chr>                      <int>        <dbl>
## 1 Equity       All Cap                       23      95.5   
## 2 Equity       Large Cap                      1       1.85  
## 3 Equity       Large/Mid Cap                 10      42.4   
## 4 Equity       Small Cap                     17     187.    
## 5 Fixed Income Corporates                     1       0.0413
## 6 Fixed Income Credit                         1       0.408 
## 7 Fixed Income Government                     3      17.6   
## 8 Fixed Income High Yield                    16      38.2   
## 9 Real Estate  Real Estate Securities         4       7.15

#Problem 4

# Sort the dataset by "msci_weighted_average_carbon_intensity_tons_co2e_m_sales" in descending order,
# select the top 5 most carbon-intense ESG ETFs
top_5_etfs <- blackrock_etf_screener_w_new_features %>%
  filter(standard_or_esg == "ESG") %>%
  filter(!is.na(msci_weighted_average_carbon_intensity_tons_co2e_m_sales)) %>%
  arrange(desc(msci_weighted_average_carbon_intensity_tons_co2e_m_sales)) %>%
  head(5)

# Calculate average carbon intensity for non-ESG ETFs
non_esg_average <- blackrock_etf_screener_w_new_features %>%
  filter(standard_or_esg != "ESG" & !is.na(msci_weighted_average_carbon_intensity_tons_co2e_m_sales)) %>%
  summarise(average_carbon_intensity = mean(msci_weighted_average_carbon_intensity_tons_co2e_m_sales))

# Create a data frame for plotting
plot_data <- data.frame(
  ETF = gsub(" ETF$", "", c(top_5_etfs$name, "Average Non-ESG ETF")),  # Remove "ETF" suffix
  Carbon_Intensity = c(top_5_etfs$msci_weighted_average_carbon_intensity_tons_co2e_m_sales, non_esg_average$average_carbon_intensity),
  Type = c(rep("ESG", 5), "Non-ESG")
)

# Create the grouped bar plot
ggplot(plot_data, aes(x = ETF, y = Carbon_Intensity, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_text(aes(label = round(Carbon_Intensity, 2)), vjust = -0.5, position = position_dodge(width = 0.7), size = 3) +
  labs(title = "Top 5 Most Carbon-Intense ESG ETFs vs. Average Non-ESG",
       x = "",
       y = "MSCI Weighted Average Carbon Intensity (tons CO2e/m sales)",
       fill = "Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0))) +
  ylim(0, max(plot_data$Carbon_Intensity) * 1.1)  # Adjust y-axis limits for better readability

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Data Transformation

2024-02-14