Week3 HW_Linyi Zheng

Practice Problem

set up

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

options(scipen = 999)
blackrock_etf_screener <- read_csv ("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/ishares_etf_screener_as_of_2023-12-27.csv")

Rows: 424 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): ticker, name, asset_class, sub_asset_class, region, market, locat...
dbl   (6): gross_expense_ratio_percent, net_expense_ratio_percent, net_asset...
dttm  (2): incept_date, net_assets_as_of

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

blackrock_etf_screener

# A tibble: 424 × 18
   ticker name                        incept_date         gross_expense_ratio_…¹
   <chr>  <chr>                       <dttm>                               <dbl>
 1 IVV    iShares Core S&P 500 ETF    2000-05-15 00:00:00                   0.03
 2 IEFA   iShares Core MSCI EAFE ETF  2012-10-18 00:00:00                   0.07
 3 AGG    iShares Core U.S. Aggregat… 2003-09-22 00:00:00                   0.03
 4 IWF    iShares Russell 1000 Growt… 2000-05-22 00:00:00                   0.19
 5 IJR    iShares Core S&P Small-Cap… 2000-05-22 00:00:00                   0.06
 6 IJH    iShares Core S&P Mid-Cap E… 2000-05-22 00:00:00                   0.05
 7 IEMG   iShares Core MSCI Emerging… 2012-10-18 00:00:00                   0.09
 8 IWM    iShares Russell 2000 ETF    2000-05-22 00:00:00                   0.19
 9 IWD    iShares Russell 1000 Value… 2000-05-22 00:00:00                   0.19
10 TLT    iShares 20+ Year Treasury … 2002-07-22 00:00:00                   0.15
# ℹ 414 more rows
# ℹ abbreviated name: ¹gross_expense_ratio_percent
# ℹ 14 more variables: net_expense_ratio_percent <dbl>, net_assets_usd <dbl>,
#   net_assets_as_of <dttm>, asset_class <chr>, sub_asset_class <chr>,
#   region <chr>, market <chr>, location <chr>, investment_style <chr>,
#   msci_esg_fund_rating_aaa_ccc <chr>, msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>, …

Problem1

Create a dplyr pipeline to create a tibble fulfills the following parameters:

only contains ESG funds (no standard funds)
contains the largest fund from each sustainable classification.
shows the percent of assets that the top fund comprises for its category (e.g. the top Thematic fund is x% of the total assets of all thematic funds).
Ranks each sustainable classification by the % of assets in its largest fund, and arranges them in descending order.

library(dplyr)

result <- blackrock_etf_screener %>%
  filter(!is.na(sustainable_classification) & sustainable_classification != "") %>%
  group_by(sustainable_classification) %>%
  mutate(Total_Assets = sum(net_assets_usd)) %>%
  slice(which.max(net_assets_usd)) %>%
  mutate(Percent_of_Total_Assets = net_assets_usd / Total_Assets * 100) %>%
  arrange(desc(Percent_of_Total_Assets))

print (result)

# A tibble: 4 × 20
# Groups:   sustainable_classification [4]
  ticker name  incept_date         gross_expense_ratio_…¹ net_expense_ratio_pe…²
  <chr>  <chr> <dttm>                               <dbl>                  <dbl>
1 BGRN   iSha… 2018-11-13 00:00:00                   0.2                    0.2 
2 ICLN   iSha… 2008-06-24 00:00:00                   0.41                   0.41
3 XVV    iSha… 2020-09-22 00:00:00                   0.08                   0.08
4 ESGU   iSha… 2016-12-01 00:00:00                   0.15                   0.15
# ℹ abbreviated names: ¹gross_expense_ratio_percent, ²net_expense_ratio_percent
# ℹ 15 more variables: net_assets_usd <dbl>, net_assets_as_of <dttm>,
#   asset_class <chr>, sub_asset_class <chr>, region <chr>, market <chr>,
#   location <chr>, investment_style <chr>, msci_esg_fund_rating_aaa_ccc <chr>,
#   msci_esg_quality_score_0_10 <dbl>,
#   msci_weighted_average_carbon_intensity_tons_co2e_m_sales <dbl>,
#   msci_esg_percent_coverage <dbl>, sustainable_classification <chr>, …

Problem 2

Create a sorted bar chart of the 10 smallest ETFs.

Hint: for sorting the bar chart, look up fct_reorder() . Make sure your chart has meaningful titles and labels, including numbers of an appropriate magnitude.

library (ggplot2)
library(forcats)

# Step 1: Select the 10 smallest ETFs based on assets under management
smallest_etfs <- blackrock_etf_screener %>%
  arrange(net_assets_usd) %>%
  head(10)

# Step 2: Sort the ETFs based on their assets under management
smallest_etfs <- smallest_etfs %>%
  mutate(name = fct_reorder(name, net_assets_usd))

# Step 3: Create a bar chart
ggplot(smallest_etfs, aes(x = name, y = net_assets_usd)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Top 10 Smallest ETFs",
       x = "ETF Name",
       y = "Net Assets ($)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()  # Flip the coordinates for horizontal bars

Problem 3

Find the funds in the bottom quintile of the MSCI ESG quality score.

calculate the number of funds by total assets by asset class and sub asset class.

# Step 1: Identify the bottom quintile of funds based on the MSCI ESG quality score
bottom_quintile <- blackrock_etf_screener %>%
  filter(!is.na(msci_esg_quality_score_0_10)) %>%
  arrange(msci_esg_quality_score_0_10) %>%
  slice(1:ceiling(n() * 0.2))  

# Step 2: Group the data by asset class and sub-asset class
grouped_data <- bottom_quintile %>%
  group_by(asset_class, sub_asset_class) %>%
  summarize(total_assets = sum(net_assets_usd))

`summarise()` has grouped output by 'asset_class'. You can override using the
`.groups` argument.

# Step 3: Display the number of funds by total assets by asset class and sub-asset class
print(grouped_data)

# A tibble: 9 × 3
# Groups:   asset_class [3]
  asset_class  sub_asset_class         total_assets
  <chr>        <chr>                          <dbl>
1 Equity       All Cap                 95486052602.
2 Equity       Large Cap                1852506812.
3 Equity       Large/Mid Cap           42421984678.
4 Equity       Small Cap              187292555389.
5 Fixed Income Corporates                 41339182.
6 Fixed Income Credit                    408038372.
7 Fixed Income Government              17560711395.
8 Fixed Income High Yield              38239624578.
9 Real Estate  Real Estate Securities   7145439523.

Problem 4

1,average net expense ratio by region

factoid1 <- blackrock_etf_screener %>%
  group_by(region) %>%
  summarise(Avg_Expense_Ratio = mean(net_expense_ratio_percent)) %>%
  ggplot(aes(x = region, y = Avg_Expense_Ratio)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Average Expense Ratio by region",
       x = "region",
       y = "Average Expense Ratio")
print (factoid1)

2.distribution of fund sizes

factoid2 <- ggplot(blackrock_etf_screener, aes(x = net_assets_usd)) +
  geom_histogram(fill = "skyblue", color = "black") +
  labs(title = "Distribution of Fund Sizes",
       x = "Net assets USD",
       y = "Frequency")
print (factoid2)

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.