Data Transformation

Author

Ashwini Arulrajhan

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
blackrock_etf_screener <- read_csv("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/ishares_etf_screener_as_of_2023-12-27.csv") 
Rows: 424 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): ticker, name, asset_class, sub_asset_class, region, market, locat...
dbl   (6): gross_expense_ratio_percent, net_expense_ratio_percent, net_asset...
dttm  (2): incept_date, net_assets_as_of

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
blackrock_etf_screener_w_new_features <- blackrock_etf_screener |> 
  mutate(
    # if the sustainable_classification column is NA, then the fund is not an ESG fund.
    standard_or_esg = if_else(
      condition = is.na(sustainable_classification),
      true = "Standard",
      false = "ESG"
    ),
    # lubridate::year() extracts the year from a date
    inception_year = lubridate::year(incept_date),
    
    # Change to a meaningful magnitude for the data. In asset management, billions is a good default. 
    net_assets_bn_usd = net_assets_usd/10^9,
    
    # let's put our new variables at the front so we can see them easily.
    .before = everything()
  )

blackrock_etf_screener_w_new_features
# A tibble: 424 × 21
   standard_or_esg inception_year net_assets_bn_usd ticker name                 
   <chr>                    <dbl>             <dbl> <chr>  <chr>                
 1 Standard                  2000             399.  IVV    iShares Core S&P 500…
 2 Standard                  2012             107.  IEFA   iShares Core MSCI EA…
 3 Standard                  2003             101.  AGG    iShares Core U.S. Ag…
 4 Standard                  2000              82.1 IWF    iShares Russell 1000…
 5 Standard                  2000              78.2 IJR    iShares Core S&P Sma…
 6 Standard                  2000              77.1 IJH    iShares Core S&P Mid…
 7 Standard                  2012              73.9 IEMG   iShares Core MSCI Em…
 8 Standard                  2000              67.7 IWM    iShares Russell 2000…
 9 Standard                  2000              55.4 IWD    iShares Russell 1000…
10 Standard                  2002              52.2 TLT    iShares 20+ Year Tre…
# ℹ 414 more rows
# ℹ 16 more variables: incept_date <dttm>, gross_expense_ratio_percent <dbl>,
#   net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

Homework Problem 1

blackrock_etf_screener_ESG <- blackrock_etf_screener_w_new_features |>
  filter(standard_or_esg == "ESG") |> 
  group_by(sustainable_classification) |> 
  mutate(total_assets_category = sum(net_assets_bn_usd))|> 
  slice_max(order_by = net_assets_bn_usd) |> 
  
  mutate(percent_of_category = net_assets_bn_usd/total_assets_category                * 100) |> 
  arrange(desc(percent_of_category)) 
  
blackrock_etf_screener_ESG
# A tibble: 4 × 23
# Groups:   sustainable_classification [4]
  standard_or_esg inception_year net_assets_bn_usd ticker name                  
  <chr>                    <dbl>             <dbl> <chr>  <chr>                 
1 ESG                       2018             0.335 BGRN   iShares USD Green Bon…
2 ESG                       2008             3.06  ICLN   iShares Global Clean …
3 ESG                       2020             0.182 XVV    iShares ESG Screened …
4 ESG                       2016            13.4   ESGU   iShares ESG Aware MSC…
# ℹ 18 more variables: incept_date <dttm>, gross_expense_ratio_percent <dbl>,
#   net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>,
#   msci_esg_percent_coverage <dbl>, sustainable_classification <chr>, …

Homework Problem 2

blackrock_etf_screener_smallest <- blackrock_etf_screener |> 
 slice_min(order_by = net_assets_usd , n = 10) |> 
   mutate(net_assets_usd_mn = net_assets_usd / 10^6)

ggplot(blackrock_etf_screener_smallest, aes(x = fct_reorder(name, net_assets_usd_mn), y = net_assets_usd_mn)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels for better readability
  labs(x = "ETF Name", y = "Asset Size (in $mn)")

Homework Problem 3

bottom_quintile <- blackrock_etf_screener_w_new_features |> 
  filter(!is.na(msci_esg_quality_score_0_10))|>   # Filter out NA values in MSCI ESG quality score
  arrange(msci_esg_quality_score_0_10) |>   # Arrange data by MSCI ESG quality score in ascending order
  mutate(quintile = ntile(msci_esg_quality_score_0_10, 5)) |>   # Create quintile groups
  filter(quintile == 1)  # Select funds in the bottom quintile

summary_stats <- bottom_quintile |> 
  group_by(asset_class, sub_asset_class) |> 
  summarise(
    number_of_funds = n(),  # Calculate the number of funds
    total_assets = sum(net_assets_bn_usd)  # Calculate the total assets
  )
`summarise()` has grouped output by 'asset_class'. You can override using the
`.groups` argument.
summary_stats
# A tibble: 9 × 4
# Groups:   asset_class [3]
  asset_class  sub_asset_class        number_of_funds total_assets
  <chr>        <chr>                            <int>        <dbl>
1 Equity       All Cap                             23      95.5   
2 Equity       Large Cap                            1       1.85  
3 Equity       Large/Mid Cap                       10      42.4   
4 Equity       Small Cap                           17     187.    
5 Fixed Income Corporates                           1       0.0413
6 Fixed Income Credit                               1       0.408 
7 Fixed Income Government                           3      17.6   
8 Fixed Income High Yield                          16      38.2   
9 Real Estate  Real Estate Securities               4       7.15  

Homework Problem 4 (Chart 1)

BlackRock ETF Composition by Asset Class

ggplot(blackrock_etf_screener_w_new_features) +
  aes(x = asset_class, y = net_assets_bn_usd) +
  geom_col(fill = "#112446") +
  labs(
    x = "Asset Classes",
    y = "Net Assets ($bn)",
    title = "BlackRock ETF Asset Class Composition"
  ) +
  theme_minimal()

Homework Problem 4 (Chart 2)

blackrock_etf_screener_w_new_features <-blackrock_etf_screener_w_new_features |> 
  group_by(standard_or_esg, asset_class)|>
    mutate(total_assets_by_assetclass_esg_standard = sum(net_assets_bn_usd)) |>
    group_by(sub_asset_class)  |>
     mutate(percent_assets_sub_assetclass = (net_assets_bn_usd/total_assets_by_assetclass_esg_standard)*100 )

equity_data <- blackrock_etf_screener_w_new_features %>%
  filter(asset_class == "Equity")

ggplot(equity_data, aes(x = sub_asset_class, y = percent_assets_sub_assetclass, fill = sub_asset_class)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ standard_or_esg) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels for better readability
  labs(x = "Sub Asset Class", y = "% Assets", title = "Sub Asset Class Composition of Equity ETFs")

  scale_fill_viridis_d()
<ggproto object: Class ScaleDiscrete, Scale, gg>
    aesthetics: fill
    axis_order: function
    break_info: function
    break_positions: function
    breaks: waiver
    call: call
    clone: function
    dimension: function
    drop: TRUE
    expand: waiver
    get_breaks: function
    get_breaks_minor: function
    get_labels: function
    get_limits: function
    guide: legend
    is_discrete: function
    is_empty: function
    labels: waiver
    limits: NULL
    make_sec_title: function
    make_title: function
    map: function
    map_df: function
    n.breaks.cache: NULL
    na.translate: TRUE
    na.value: NA
    name: waiver
    palette: function
    palette.cache: NULL
    position: left
    range: environment
    rescale: function
    reset: function
    scale_name: viridis_d
    train: function
    train_df: function
    transform: function
    transform_df: function
    super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

Homework Problem 4 (Chart 3)

fixed_income_data <- blackrock_etf_screener_w_new_features %>%
  filter(asset_class == "Fixed Income")

ggplot(fixed_income_data, aes(x = sub_asset_class, y = percent_assets_sub_assetclass, fill = sub_asset_class)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ standard_or_esg) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels for better readability
  labs(x = "Sub Asset Class", y = "% Assets", title = "Sub Asset Class Composition of Fixed Income ETFs")

  scale_fill_viridis_d()
<ggproto object: Class ScaleDiscrete, Scale, gg>
    aesthetics: fill
    axis_order: function
    break_info: function
    break_positions: function
    breaks: waiver
    call: call
    clone: function
    dimension: function
    drop: TRUE
    expand: waiver
    get_breaks: function
    get_breaks_minor: function
    get_labels: function
    get_limits: function
    guide: legend
    is_discrete: function
    is_empty: function
    labels: waiver
    limits: NULL
    make_sec_title: function
    make_title: function
    map: function
    map_df: function
    n.breaks.cache: NULL
    na.translate: TRUE
    na.value: NA
    name: waiver
    palette: function
    palette.cache: NULL
    position: left
    range: environment
    rescale: function
    reset: function
    scale_name: viridis_d
    train: function
    train_df: function
    transform: function
    transform_df: function
    super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

Homework Problem 4 (Chart 4)

only_ESG <- blackrock_etf_screener_w_new_features %>% filter(standard_or_esg == "ESG")

library(ggplot2)

ggplot(only_ESG) +
 aes(x = sustainable_classification, y = msci_esg_quality_score_0_10, colour = sustainable_classification, 
 size = net_assets_bn_usd) +
 geom_point(shape = "circle") +
 scale_color_hue(direction = 1) +
 labs(x = "Sustainability Classification", 
 y = "ESG Quality Score", subtitle = "ESG Quality Scores by sustainability classification") +
 theme_minimal()