library(tidyverse)
# assign the url to `github_raw_csv_url`
github_raw_csv_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/blackrock_etf_screener_2022-08-30.csv"
# read in the data, and assign it to the object `blackrock_etf_data`
blackrock_etf_data <- read_csv(github_raw_csv_url)
## Rows: 393 Columns: 22
## ── Column specification ──────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): ticker, name, incept_date, net_assets_as_of, asset_class, sub_asset_class, regio...
## dbl (8): gross_expense_ratio_percent, net_expense_ratio_percent, net_assets_usd_mn, msci_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
blackrock_etf_data <- blackrock_etf_data |>
# we are transforming both date columns (currently character strings) into date objects
# so we can work with them.
# this syntax is a bit confusing, but selects all columns containing `date` and applies
# lubridate::mdy() function to them to turn them into date objects.
mutate(across(contains("date"), lubridate::mdy)) |>
# Billions is a more useful magnitude than millions, so we'll create a column with
# the assets in billions by dividing by `net_assets_millions` by 1,000 (10^3)
# If we wanted trillions, we could divide by 1,000,000 (10^6)
mutate(net_assets_usd_bn = net_assets_usd_mn/10^3) |>
# this column doesn't add anything to our analysis - it says that the data is from 8/30/22
select(-net_assets_as_of)
mini_blackrock_data <- blackrock_etf_data |>
# group by whether the fund is an ESG fund or not
group_by(is_esg) |>
# take the top 5 from each group, by net assets
slice_max(order_by = net_assets_usd_bn, n = 5) |>
# select the following columns
select(ticker, fund_name = name_wo_ishares_etf, asset_class, sub_asset_class, region, incept_date, net_assets_usd_bn,
msci_weighted_average_carbon_intensity_tons_co2e_m_sales) |>
# rename to `co2_intensity` because the full name is a mouthful, if descriptive.
rename(co2_intensity = msci_weighted_average_carbon_intensity_tons_co2e_m_sales) |>
# always good to ungroup() if you've used a group_by(). We'll discuss later.
ungroup()
## Adding missing grouping variables: `is_esg`
mini_blackrock_data |>
glimpse()
## Rows: 10
## Columns: 9
## $ is_esg [3m[38;5;246m<chr>[39m[23m "ESG Fund", "ESG Fund", "ESG Fund", "ESG Fund", "ESG Fund", "Regul…
## $ ticker [3m[38;5;246m<chr>[39m[23m "ESGU", "ESGD", "ICLN", "ESGE", "DSI", "IVV", "IEFA", "AGG", "IJR"…
## $ fund_name [3m[38;5;246m<chr>[39m[23m "ESG Aware MSCI USA", "ESG Aware MSCI EAFE", "Global Clean Energy"…
## $ asset_class [3m[38;5;246m<chr>[39m[23m "Equity", "Equity", "Equity", "Equity", "Equity", "Equity", "Equit…
## $ sub_asset_class [3m[38;5;246m<chr>[39m[23m "Large/Mid Cap", "Large/Mid Cap", "All Cap", "Large/Mid Cap", "Lar…
## $ region [3m[38;5;246m<chr>[39m[23m "North America", "Global", "Global", "Global", "North America", "N…
## $ incept_date [3m[38;5;246m<date>[39m[23m 2016-12-01, 2016-06-28, 2008-06-24, 2016-06-28, 2006-11-14, 2000-…
## $ net_assets_usd_bn [3m[38;5;246m<dbl>[39m[23m 22.376688, 6.425673, 5.628011, 4.233544, 3.688930, 297.663232, 84.…
## $ co2_intensity [3m[38;5;246m<dbl>[39m[23m 103.60, 103.83, 265.82, 167.71, 72.73, 148.34, 126.62, 282.72, 132…
funds_by_assets_bn <- mini_blackrock_data |>
select(fund_name, is_esg, region, net_assets_usd_bn)
funds_by_assets_bn
funds_by_assets_bn |>
mutate(value_z_score = (net_assets_usd_bn-mean(net_assets_usd_bn))/sd(net_assets_usd_bn),
value_rank = rank(net_assets_usd_bn),
value_percentile = percent_rank(net_assets_usd_bn),
value_ntile_5 = ntile(net_assets_usd_bn, 5),
value_pct_total = net_assets_usd_bn/sum(net_assets_usd_bn)) |>
arrange(value_rank)
funds_by_assets_bn |>
mutate(value_z_score = (net_assets_usd_bn-mean(net_assets_usd_bn))/sd(net_assets_usd_bn)
* -1, # multiply by negative 1
value_rank = rank(net_assets_usd_bn * -1), # multiply by negative 1
value_percentile = percent_rank(net_assets_usd_bn * -1), # multiply by negative 1
value_ntile_5 = ntile(net_assets_usd_bn * -1, 5), # multiply by negative 1
value_pct_total = net_assets_usd_bn/sum(net_assets_usd_bn) # not relevant
) |>
arrange(value_rank)
# add the argument
plus_one <- function(x) {
# define what you want it to do
x + 1
}
plus_one(7)
## [1] 8
add_rank_features <- function(data, value_var = value, rank_by_lowest = TRUE,
quantile_n = 5) {
# If rank_by_lowest is set to TRUE, as it is by default, then it will use this first code chunk
if (rank_by_lowest)
data |>
mutate("{{value_var}}_z_score" := ({{ value_var }}-mean({{ value_var }}))/
sd({{ value_var }}),
"{{value_var}}_rank" := rank({{ value_var }}),
"{{value_var}}_percentile" := percent_rank({{ value_var }}),
"{{value_var}}_quantile_{{quantile_n}}" := ntile({{ value_var }},
{{ quantile_n }}),
"{{value_var}}_pct_total" := {{ value_var }}/sum({{ value_var }})) |>
arrange({{ value_var }})
# If rank_by_lowest is set to FALSE, as it is by default, then it will use this second code chunk
else
data |>
mutate("{{value_var}}_z_score" := ({{ value_var }}-mean({{ value_var }}))/
sd({{ value_var }}) * -1, # multiply by -1
# put negative sign
"{{value_var}}_rank" := rank(-{{ value_var }}),
# put negative sign
"{{value_var}}_percentile" := percent_rank(-{{ value_var }}),
# put negative sign
"{{value_var}}_quantile_{{quantile_n}}" := ntile(-{{ value_var }},
{{ quantile_n }}),
# no change, not relevant for pct_total
"{{value_var}}_pct_total" := {{ value_var }}/sum({{ value_var }})) |>
arrange({{ value_var }}*-1)
}
funds_by_assets_bn |>
add_rank_features(value_var = net_assets_usd_bn)
funds_by_assets_bn |>
add_rank_features(value_var = net_assets_usd_bn,
# Now it will rank by highest value equals 1
rank_by_lowest = FALSE,
# let's divide it into 10 quantiles (deciles)
quantile_n = 10)
funds_by_assets_bn |>
group_by(is_esg) |>
add_rank_features(value_var = net_assets_usd_bn,
# Now it will rank by highest value equals 1
rank_by_lowest = FALSE)
funds_by_assets_bn |>
group_by(is_esg, region) |>
add_rank_features(value_var = net_assets_usd_bn,
# Now it will rank by highest value equals 1
rank_by_lowest = FALSE) |>
# arrange first by is_esg, second by region, and third by rank so you can easily
# see the ranking within each sub-group
arrange(is_esg, region, net_assets_usd_bn_rank)
funds_by_assets_bn |>
group_by(is_esg) |>
# always default to using na.rm = TRUE, otherwise NA values
# (very common in the wild) will trip up your calculations
summarize(avg_assets = mean(net_assets_usd_bn, na.rm = TRUE))
funds_by_assets_bn |>
group_by(is_esg) |>
summarize(avg = mean(net_assets_usd_bn, na.rm = TRUE),
std_dev = sd(net_assets_usd_bn, na.rm = TRUE),
min = min(net_assets_usd_bn, na.rm = TRUE),
max = max(net_assets_usd_bn, na.rm = TRUE),
pctile_25 = quantile(net_assets_usd_bn, .25, na.rm = TRUE),
# median is the same as the 50th percentile above
median = median(net_assets_usd_bn, na.rm = TRUE),
pctile_75 = quantile(net_assets_usd_bn, .75, na.rm = TRUE))
calculate_summary_stats <- function(data, value_var = value) {
data |>
summarize(avg = mean({{ value_var }}, na.rm = TRUE),
std_dev = sd({{ value_var }}, na.rm = TRUE),
min = min({{ value_var }}, na.rm = TRUE),
max = max({{ value_var }}, na.rm = TRUE),
pctile_25 = quantile({{ value_var }}, .25, na.rm = TRUE),
median = median({{ value_var }}, na.rm = TRUE),
pctile_75 = quantile({{ value_var }}, .75, na.rm = TRUE)) |>
# ungrouping is helpful so we don't have to do so afterwards. Save time.
ungroup()
}
funds_by_assets_bn |>
group_by(is_esg) |>
calculate_summary_stats(value_var = net_assets_usd_bn)
funds_by_assets_bn |>
group_by(is_esg, region) |>
calculate_summary_stats(value_var = net_assets_usd_bn)
## `summarise()` has grouped output by 'is_esg'. You can override using the `.groups` argument.
vignette("pivot")
funds_by_assets_bn |>
group_by(is_esg) |>
calculate_summary_stats(value_var = net_assets_usd_bn)
funds_by_assets_bn |>
# group by the feature(s) you want summary statistics for
group_by(is_esg) |>
# use the function we built to calculate those summary stats.
calculate_summary_stats(value_var = net_assets_usd_bn) |>
# pivot the data longer (tidy format) so that all of the summary stats names are in one and the values are in another.
pivot_longer(cols = avg:pctile_75, names_to = "aggregates")
funds_by_assets_bn |>
# group by the feature(s) you want summary statistics for
group_by(is_esg) |>
# use the function we built to calculate those summary stats.
calculate_summary_stats(value_var = net_assets_usd_bn) |>
# pivot the data longer (tidy format) so that all of the summary stats names are in one and the values are in another.
pivot_longer(cols = avg:pctile_75, names_to = "aggregates") |>
# then pivot the data into wide format where the two groups you are comparing are columns
pivot_wider(names_from = is_esg, values_from = value)
install.packages("janitor")
## Error in install.packages : Updating loaded packages
funds_by_assets_bn |>
# group by the feature(s) you want summary statistics for
group_by(is_esg) |>
# use the function we built to calculate those summary stats.
calculate_summary_stats(value_var = net_assets_usd_bn) |>
# pivot the data longer (tidy format) so that all of the summary stats names are in one and the values are in another.
pivot_longer(cols = avg:pctile_75, names_to = "aggregates") |>
# then pivot the data into wide format where the two groups you are comparing are columns
pivot_wider(names_from = is_esg, values_from = value) |>
# make names snake_case which is easier to work with in the mutate function
janitor::clean_names()
funds_by_assets_bn |>
# group by the feature(s) you want summary statistics for
group_by(is_esg) |>
# use the function we built to calculate those summary stats.
calculate_summary_stats(value_var = net_assets_usd_bn) |>
# pivot the data longer (tidy format) so that all of the summary stats names are in one and the values are in another.
pivot_longer(cols = avg:pctile_75, names_to = "aggregates") |>
# then pivot the data into wide format where the two groups you are comparing are columns
pivot_wider(names_from = is_esg, values_from = value) |>
# make names snake_case which is easier to work with in the mutate function
janitor::clean_names() |>
# use your judgement to figure out the best way to compare the two groups
mutate(esg_fund_pct_of_regular = esg_fund/regular_fund * 100,
regular_times_larger_than_esg = regular_fund/esg_fund)
Practice
etf_comparison_data_github_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/etf_comparison-2022-10-03.csv"
etf_comparison <- read_csv(etf_comparison_data_github_url)
## Rows: 537 Columns: 14
## ── Column specification ──────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): ticker, company_name, sector, esg_uw_ow
## dbl (7): esg_etf, standard_etf, esg_tilt, esg_tilt_z_score, esg_tilt_rank, esg_tilt_percen...
## lgl (3): in_esg_only, in_standard_only, in_on_index_only
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
etf_comparison
#this scatter plot shows the ESG tilt percentle distribution in different sectors and ranked by factor ESG_ETF
ggplot(data=etf_comparison,aes(x=sector,y=esg_tilt_percentile))+
geom_point(aes(color=esg_etf),size=3)+
labs(title="ETF ESG tilt by sector",x="sector",y="ESG tilt percentile")+
theme_minimal()+
theme(legend.position="bottom")

# this graph shows ESG tilt z score in different tickers.
ggplot(data = etf_comparison, aes(x = ticker, y = esg_tilt_z_score)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "ESG Tilt Z-Score by ETF Ticker",
x = "ETF Ticker",
y = "ESG Tilt Z-Score") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1))

#this graph illustrates which sector has the most ESG ETF fund. in this case, information technology ranked the highest in terms fo ESG ETF
ggplot(data = etf_comparison, aes(x = sector, y = esg_etf)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = "Classification by sector",
x = "sector",
y = "ESG ETF",
fill = "Asset Class") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom")
