Data Transformation Practice Questions

Author

Yuanling Zeng

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
options(scipen = 999)
blackrock_etf_screener <- read_csv ("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/ishares_etf_screener_as_of_2023-12-27.csv")
Rows: 424 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): ticker, name, asset_class, sub_asset_class, region, market, locat...
dbl   (6): gross_expense_ratio_percent, net_expense_ratio_percent, net_asset...
dttm  (2): incept_date, net_assets_as_of

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Problem 1:

  • Create a dplyr pipeline to create a tibble fulfills the following parameters:

  • only contains ESG funds (no standard funds)

  • contains the largest fund from each sustainable classification.

  • shows the percent of assets that the top fund comprises for its category (e.g. the top Thematic fund is x% of the total assets of all thematic funds).

  • Ranks each sustainable classification by the % of assets in its largest fund, and arranges them in descending order.

library(dplyr)
blackrock_etf_screener %>%
  filter(!is.na(sustainable_classification)) %>%
  group_by(sustainable_classification) %>%
  summarize(
    largest_fund = max(net_assets_usd, na.rm = TRUE),
    n = n(),
    percent_of_largest_fund = (largest_fund / sum(net_assets_usd) * 100)
  ) %>%
  arrange(desc(percent_of_largest_fund)) %>%
  ungroup()
# A tibble: 4 × 4
  sustainable_classification largest_fund     n percent_of_largest_fund
  <chr>                             <dbl> <int>                   <dbl>
1 Impact                       335216065.     1                   100  
2 Thematic                    3057666513.     6                    89.2
3 Screened                     182340754.     3                    47.6
4 Uplift                     13385805595.    30                    26.0

Problem 2:

  • Create a sorted bar chart of the 10 smallest ETFs.
library(ggplot2)
smallest_etfs <- blackrock_etf_screener %>%
  arrange(net_assets_usd) %>%
  head(10) %>%
  mutate(ticker = fct_reorder(ticker, net_assets_usd))
ggplot(smallest_etfs, aes(x = ticker, y = net_assets_usd)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "Top 10 Smallest ETFs by Net Assets",
       x = "ETF Name",
       y = "Net Assets (in USD)")

Problem 3:

  • Find the funds in the bottom quintile of the MSCI ESG quality score.

  • For the funds in the bottom quintile of MSCI ESG quality scores, calculate i) the number of funds, and ii) the total assets, grouped by asset class (e.g. Equity, Fixed Income) and sub-asset class (e.g. Emerging Markets).

bottom_quintile <- blackrock_etf_screener %>%
  filter(msci_esg_quality_score_0_10 <= quantile(msci_esg_quality_score_0_10, 0.2, na.rm = TRUE))
summary <- bottom_quintile %>%
  group_by(asset_class, sub_asset_class) %>%
  summarize(
    number_of_funds = n(),
    total_assets = sum(net_assets_usd),
    .groups = "drop"
  )
print(summary)
# A tibble: 9 × 4
  asset_class  sub_asset_class        number_of_funds  total_assets
  <chr>        <chr>                            <int>         <dbl>
1 Equity       All Cap                             23  95486052602.
2 Equity       Large Cap                            1   1852506812.
3 Equity       Large/Mid Cap                       10  42421984678.
4 Equity       Small Cap                           17 187292555389.
5 Fixed Income Corporates                           1     41339182.
6 Fixed Income Credit                               1    408038372.
7 Fixed Income Government                           3  17560711395.
8 Fixed Income High Yield                          16  38239624578.
9 Real Estate  Real Estate Securities               4   7145439523.

Problem 4:

  • Use your newfound dplyr skills to uncover actionable insights from the dataset.

  • Show your work.

  • Present 5 repeatable factoids or data visualizations.

data visualization 1: Bar Chart

compare the distribution of ESG funds across different asset classes within each market type.

capital_counts <- blackrock_etf_screener %>%
  group_by(market, asset_class) %>%
  summarize(num_funds = n(),.groups = "drop")
ggplot(capital_counts, aes(x = asset_class, y = num_funds, fill = market)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Asset Class Difference of ESGs in Each Market Type",
       x = "Asset Class",
       y = "Number of Funds",
       fill = "market") +
  scale_fill_manual(values = c("Developed" = "lightpink", "Emerging" = "skyblue", "Frontier" = "lightgreen")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

data visualization 2: Scatter Plot

compare MSCI ESG quality scores across two main market types

blackrock_etf_screener %>%
  filter(!is.na(msci_esg_quality_score_0_10)) %>%
  ggplot(aes(x = msci_esg_quality_score_0_10, y = market)) +
  geom_point(color = "orange") +
  labs(title = "MSCI ESG Quality Score Comparison",
       x = "MSCI ESG Quality Score",
       y = "Market") +
  theme_minimal()

data visualization 3: Pie Chart

visualize the distribution of MSCI ESG fund ratings across different categories

filtered_data <- blackrock_etf_screener %>%
  filter(!is.na(msci_esg_fund_rating_aaa_ccc))
rating_counts <- filtered_data %>%
  count(msci_esg_fund_rating_aaa_ccc)
rating_counts <- rating_counts %>%
  mutate(percent = n / sum(n) * 100)
pie_chart <- ggplot(rating_counts, aes(x = "", y = n, fill = msci_esg_fund_rating_aaa_ccc)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Distribution of MSCI ESG Fund Ratings",
       fill = "MSCI ESG Rating") +
  theme_void() +
  scale_fill_brewer(palette = "Set3") +
  geom_text(aes(label = paste0(round(percent, 1), "%")), position = position_stack(vjust = 0.5))
print(pie_chart)

data visualization 4: Box Plot

represent the distribution of MSCI ESG Quality Score across different regions

ggplot(blackrock_etf_screener, aes(x = msci_esg_quality_score_0_10, y = region, fill = region)) +
  geom_boxplot() +
  labs(title = "Quality Score Distribution in Each Region",
       x = "MSCI ESG Quality Score",
       y = "Region",
       fill = "Region") +
  theme_minimal()
Warning: Removed 44 rows containing non-finite values (`stat_boxplot()`).

data visualization 5: Bar Chart

compare the distribution of MSCI ESG Fund Ratings within each region

ggplot(blackrock_etf_screener, aes(x = msci_esg_fund_rating_aaa_ccc, fill = region)) +
  geom_bar() +
  scale_fill_manual(values = c("Asia Pacific" = "turquoise4", "Europe" = "cornflowerblue", "Global" = "lightskyblue2", "Kuwait" = "lightpink2", "Latin America" = "burlywood1", "Middle East and Africa" = "lightgoldenrod1", "North America" = "mistyrose1")) +
  labs(
    title = "Distribution of MSCI ESG Fund Ratings by Region",
    x = "Rating",
    y = "Number of Funds",
    fill = "Region"
  ) +
  theme_minimal()