library(tidyverse)
# assign the url to `github_raw_csv_url`
github_raw_csv_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/blackrock_etf_screener_2022-08-30.csv"

# read in the data, and assign it to the object `blackrock_etf_data`
blackrock_etf_data <- read_csv(github_raw_csv_url)
## Rows: 393 Columns: 22
## ── Column specification ──────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): ticker, name, incept_date, net_assets_as_of, asset_class, sub_asset_class, regio...
## dbl  (8): gross_expense_ratio_percent, net_expense_ratio_percent, net_assets_usd_mn, msci_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
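
# the message above is informational only; as it suggests, passing
# show_col_types = FALSE quiets it (a sketch re-reading the same file into a
# hypothetical object, just to show the argument):
blackrock_etf_data_quiet <- read_csv(github_raw_csv_url, show_col_types = FALSE)
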
blackrock_etf_data <- blackrock_etf_data |> 
  # transform both date columns (currently character strings) into date objects
  # so we can work with them as dates.
  # the across() syntax takes some getting used to: it selects every column whose
  # name contains "date" and applies lubridate::mdy() to each one.
  mutate(across(contains("date"), lubridate::mdy)) |>
  # Billions is a more useful magnitude than millions, so we'll create a column with 
  # the assets in billions by dividing `net_assets_usd_mn` by 1,000 (10^3).
  # If we wanted trillions, we could divide by 1,000,000 (10^6).
  mutate(net_assets_usd_bn = net_assets_usd_mn/10^3) |> 
  # this column doesn't add anything to our analysis - it just says the data is as of 8/30/22
  select(-net_assets_as_of)
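
# a minimal sketch of the across() pattern in isolation, on a made-up tibble
# (toy_dates and its columns are hypothetical, just to show the syntax):
toy_dates <- tibble(start_date = "1/15/2022", end_date = "3/1/2022", value = 10)
toy_dates |> 
  mutate(across(contains("date"), lubridate::mdy))
# both *_date columns become <date>; `value` is left untouched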

mini_blackrock_data <- blackrock_etf_data |> 
  # group by whether the fund is an ESG fund or not
  group_by(is_esg) |> 
  # take the top 5 from each group, by net assets
  slice_max(order_by = net_assets_usd_bn, n = 5) |> 
  # select the following columns 
  select(ticker, fund_name = name_wo_ishares_etf, asset_class, sub_asset_class, region, incept_date, net_assets_usd_bn,
         msci_weighted_average_carbon_intensity_tons_co2e_m_sales) |> 
  # rename to `co2_intensity` because the full name is a mouthful, if descriptive.
  rename(co2_intensity = msci_weighted_average_carbon_intensity_tons_co2e_m_sales) |> 
  # always good to ungroup() if you've used a group_by().  We'll discuss later.
  ungroup()
## Adding missing grouping variables: `is_esg`
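
# the "Adding missing grouping variables" message appears because `is_esg` (the
# grouping variable) wasn't in the select(), so dplyr adds it back automatically.
#
# a quick sketch of why ungroup() matters, using a made-up toy tibble:
toy <- tibble(grp = c("a", "a", "b"), x = c(1, 2, 3))
# while grouped, mutate() computes within each group ...
toy |> group_by(grp) |> mutate(share_of_group = x / sum(x))
# ... after ungroup(), the same calculation uses all rows
toy |> group_by(grp) |> ungroup() |> mutate(share_of_total = x / sum(x))
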
mini_blackrock_data |>
  glimpse()
## Rows: 10
## Columns: 9
## $ is_esg            <chr> "ESG Fund", "ESG Fund", "ESG Fund", "ESG Fund", "ESG Fund", "Regul…
## $ ticker            <chr> "ESGU", "ESGD", "ICLN", "ESGE", "DSI", "IVV", "IEFA", "AGG", "IJR"…
## $ fund_name         <chr> "ESG Aware MSCI USA", "ESG Aware MSCI EAFE", "Global Clean Energy"…
## $ asset_class       <chr> "Equity", "Equity", "Equity", "Equity", "Equity", "Equity", "Equit…
## $ sub_asset_class   <chr> "Large/Mid Cap", "Large/Mid Cap", "All Cap", "Large/Mid Cap", "Lar…
## $ region            <chr> "North America", "Global", "Global", "Global", "North America", "N…
## $ incept_date       <date> 2016-12-01, 2016-06-28, 2008-06-24, 2016-06-28, 2006-11-14, 2000-…
## $ net_assets_usd_bn <dbl> 22.376688, 6.425673, 5.628011, 4.233544, 3.688930, 297.663232, 84.…
## $ co2_intensity     <dbl> 103.60, 103.83, 265.82, 167.71, 72.73, 148.34, 126.62, 282.72, 132…
funds_by_assets_bn <- mini_blackrock_data |> 
  select(fund_name, is_esg, region, net_assets_usd_bn) 

funds_by_assets_bn
funds_by_assets_bn |> 
  mutate(value_z_score = (net_assets_usd_bn-mean(net_assets_usd_bn))/sd(net_assets_usd_bn),
         value_rank = rank(net_assets_usd_bn),
         value_percentile = percent_rank(net_assets_usd_bn),
         value_ntile_5 = ntile(net_assets_usd_bn, 5),
         value_pct_total = net_assets_usd_bn/sum(net_assets_usd_bn)) |> 
  arrange(value_rank)
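
# the same helpers on a plain toy vector, to make their outputs easier to see:
v <- c(10, 20, 30, 40, 50)
(v - mean(v)) / sd(v)  # z-scores: distance from the mean in standard deviations
rank(v)                # 1 2 3 4 5 (lowest value gets rank 1)
percent_rank(v)        # 0.00 0.25 0.50 0.75 1.00
ntile(v, 5)            # 1 2 3 4 5 (which fifth of the distribution each value falls in)
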
funds_by_assets_bn |> 
  mutate(value_z_score = (net_assets_usd_bn-mean(net_assets_usd_bn))/
           sd(net_assets_usd_bn) * -1, # multiply by negative 1
         value_rank = rank(net_assets_usd_bn * -1), # multiply by negative 1
         value_percentile = percent_rank(net_assets_usd_bn * -1), # multiply by negative 1
         value_ntile_5 = ntile(net_assets_usd_bn * -1, 5), # multiply by negative 1
         value_pct_total = net_assets_usd_bn/sum(net_assets_usd_bn) # not relevant
         ) |> 
  arrange(value_rank)
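
# note: an equivalent way to get "highest value = rank 1" without multiplying by
# -1 is dplyr::desc(); a quick sketch on a toy vector:
v <- c(10, 20, 30)
rank(-v)           # 3 2 1
min_rank(desc(v))  # 3 2 1 (dplyr's ranking helpers accept desc() directly)
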
# define the function's argument(s) inside function(); here, a single argument x
plus_one <- function(x) {
  # define what you want the function to do with that argument
  x + 1
}

plus_one(7)
## [1] 8
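
# functions can also have arguments with default values, which the caller can
# override; a toy sketch (plus_n is a hypothetical helper, not used later):
plus_n <- function(x, n = 1) {
  x + n
}
plus_n(7)        # 8, uses the default n = 1
plus_n(7, n = 3) # 10
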
add_rank_features <- function(data, value_var = value, rank_by_lowest = TRUE, 
                              quantile_n = 5) {
  
  # If rank_by_lowest is TRUE (the default), rank so the lowest value gets rank 1
  if (rank_by_lowest) {
    data |> 
      mutate("{{value_var}}_z_score" := ({{ value_var }} - mean({{ value_var }})) /
               sd({{ value_var }}),
             "{{value_var}}_rank" := rank({{ value_var }}), 
             "{{value_var}}_percentile" := percent_rank({{ value_var }}),
             "{{value_var}}_quantile_{{quantile_n}}" := ntile({{ value_var }}, 
                                                              {{ quantile_n }}),
             "{{value_var}}_pct_total" := {{ value_var }}/sum({{ value_var }})) |> 
      arrange({{ value_var }})
    
  } else {
    # If rank_by_lowest is FALSE, flip the signs so the highest value gets rank 1
    data |> 
      mutate("{{value_var}}_z_score" := ({{ value_var }} - mean({{ value_var }})) /
               sd({{ value_var }}) * -1, # multiply by -1
             # negative sign inside rank()
             "{{value_var}}_rank" := rank(-{{ value_var }}), 
             # negative sign inside percent_rank()
             "{{value_var}}_percentile" := percent_rank(-{{ value_var }}),
             # negative sign inside ntile()
             "{{value_var}}_quantile_{{quantile_n}}" := ntile(-{{ value_var }}, 
                                                              {{ quantile_n }}), 
             # no sign change - pct_total doesn't depend on ranking direction
             "{{value_var}}_pct_total" := {{ value_var }}/sum({{ value_var }})) |> 
      arrange({{ value_var }} * -1)
  }
}
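
# a minimal sketch of the embracing ({{ }}) and name-injection (:=) pattern on
# its own, using a hypothetical helper that takes any column name and returns
# its mean (mean_of is illustrative only, not used later):
mean_of <- function(data, value_var) {
  data |> 
    summarize("{{value_var}}_mean" := mean({{ value_var }}, na.rm = TRUE))
}

funds_by_assets_bn |> 
  mean_of(net_assets_usd_bn)
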
funds_by_assets_bn |> 
  add_rank_features(value_var = net_assets_usd_bn)
funds_by_assets_bn |> 
  add_rank_features(value_var = net_assets_usd_bn, 
                    # now it will rank so that the highest value gets rank 1
                    rank_by_lowest = FALSE, 
                    # let's divide it into 10 quantiles (deciles)
                    quantile_n = 10)
funds_by_assets_bn |> 
  group_by(is_esg) |> 
  add_rank_features(value_var = net_assets_usd_bn, 
                    # now it will rank so that the highest value gets rank 1
                    rank_by_lowest = FALSE)
funds_by_assets_bn |> 
  group_by(is_esg, region) |> 
  add_rank_features(value_var = net_assets_usd_bn, 
                    # now it will rank so that the highest value gets rank 1
                    rank_by_lowest = FALSE) |> 
  # arrange first by is_esg, second by region, and third by rank so you can easily
  # see the ranking within each sub-group
  arrange(is_esg, region, net_assets_usd_bn_rank)
funds_by_assets_bn |> 
  group_by(is_esg) |> 
  # always default to using na.rm = TRUE, otherwise NA values 
  # (very common in the wild) will trip up your calculations
  summarize(avg_assets = mean(net_assets_usd_bn, na.rm = TRUE))
funds_by_assets_bn |> 
  group_by(is_esg) |> 
  summarize(avg = mean(net_assets_usd_bn, na.rm = TRUE),
            std_dev = sd(net_assets_usd_bn, na.rm = TRUE),
            min = min(net_assets_usd_bn, na.rm = TRUE),
            max = max(net_assets_usd_bn, na.rm = TRUE),
            pctile_25 = quantile(net_assets_usd_bn, .25, na.rm = TRUE),
            # the median is just the 50th percentile
            median = median(net_assets_usd_bn, na.rm = TRUE),
            pctile_75 = quantile(net_assets_usd_bn, .75, na.rm = TRUE))
calculate_summary_stats <- function(data, value_var = value) {
  data |> 
    summarize(avg = mean({{ value_var }}, na.rm = TRUE),
              std_dev = sd({{ value_var }}, na.rm = TRUE),
              min = min({{ value_var }}, na.rm = TRUE),
              max = max({{ value_var }}, na.rm = TRUE),
              pctile_25 = quantile({{ value_var }}, .25, na.rm = TRUE),
              median = median({{ value_var }}, na.rm = TRUE),
              pctile_75 = quantile({{ value_var }}, .75, na.rm = TRUE)) |> 
  # ungrouping inside the function is helpful so we don't have to remember to do it afterwards
  ungroup()
}
funds_by_assets_bn |> 
  group_by(is_esg) |> 
  calculate_summary_stats(value_var = net_assets_usd_bn)
funds_by_assets_bn |> 
  group_by(is_esg, region) |> 
  calculate_summary_stats(value_var = net_assets_usd_bn)
## `summarise()` has grouped output by 'is_esg'. You can override using the `.groups` argument.
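
# an alternative to ungroup(): summarize() can drop the grouping itself via its
# .groups argument, which also silences the message above (a sketch):
funds_by_assets_bn |> 
  group_by(is_esg, region) |> 
  summarize(avg = mean(net_assets_usd_bn, na.rm = TRUE), .groups = "drop")
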
vignette("pivot")
funds_by_assets_bn |> 
  group_by(is_esg) |> 
  calculate_summary_stats(value_var = net_assets_usd_bn) 
funds_by_assets_bn |> 
  # group by the feature(s) you want summary statistics for
  group_by(is_esg) |> 
  # use the function we built to calculate those summary stats.
  calculate_summary_stats(value_var = net_assets_usd_bn) |> 
  # pivot the data longer (tidy format) so that all the summary-stat names are in one column and the values are in another
  pivot_longer(cols = avg:pctile_75, names_to = "aggregates")
funds_by_assets_bn |> 
  # group by the feature(s) you want summary statistics for
  group_by(is_esg) |> 
  # use the function we built to calculate those summary stats.
  calculate_summary_stats(value_var = net_assets_usd_bn) |> 
  # pivot the data longer (tidy format) so that all the summary-stat names are in one column and the values are in another
  pivot_longer(cols = avg:pctile_75, names_to = "aggregates") |> 
  # then pivot the data into wide format where the two groups you are comparing are columns
  pivot_wider(names_from = is_esg, values_from = value)
install.packages("janitor")
## Error in install.packages : Updating loaded packages
funds_by_assets_bn |> 
  # group by the feature(s) you want summary statistics for
  group_by(is_esg) |> 
  # use the function we built to calculate those summary stats.
  calculate_summary_stats(value_var = net_assets_usd_bn) |> 
  # pivot the data longer (tidy format) so that all the summary-stat names are in one column and the values are in another
  pivot_longer(cols = avg:pctile_75, names_to = "aggregates") |> 
  # then pivot the data into wide format where the two groups you are comparing are columns
  pivot_wider(names_from = is_esg, values_from = value) |> 
   # make names snake_case which is easier to work with in the mutate function
  janitor::clean_names() 
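
# a quick illustration of what clean_names() does, on a made-up one-row tibble:
tibble(`ESG Fund` = 1, `Regular Fund` = 2) |> 
  janitor::clean_names()
# the columns become esg_fund and regular_fund - much easier to type in mutate()
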
funds_by_assets_bn |> 
  # group by the feature(s) you want summary statistics for
  group_by(is_esg) |> 
  # use the function we built to calculate those summary stats.
  calculate_summary_stats(value_var = net_assets_usd_bn) |> 
  # pivot the data longer (tidy format) so that all the summary-stat names are in one column and the values are in another
  pivot_longer(cols = avg:pctile_75, names_to = "aggregates") |> 
  # then pivot the data into wide format where the two groups you are comparing are columns
  pivot_wider(names_from = is_esg, values_from = value) |> 
   # make names snake_case which is easier to work with in the mutate function
  janitor::clean_names() |> 
  # use your judgement to figure out the best way to compare the two groups
  mutate(esg_fund_pct_of_regular = esg_fund/regular_fund * 100,
         regular_times_larger_than_esg = regular_fund/esg_fund)

Practice

etf_comparison_data_github_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/etf_comparison-2022-10-03.csv"

etf_comparison <- read_csv(etf_comparison_data_github_url)
## Rows: 537 Columns: 14
## ── Column specification ──────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): ticker, company_name, sector, esg_uw_ow
## dbl (7): esg_etf, standard_etf, esg_tilt, esg_tilt_z_score, esg_tilt_rank, esg_tilt_percen...
## lgl (3): in_esg_only, in_standard_only, in_on_index_only
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
etf_comparison
# this scatter plot shows the distribution of ESG tilt percentiles within each
# sector, with the points colored by the `esg_etf` column
ggplot(data = etf_comparison, aes(x = sector, y = esg_tilt_percentile)) +
  geom_point(aes(color = esg_etf), size = 3) +
  labs(title = "ETF ESG tilt by sector", x = "Sector", y = "ESG tilt percentile") +
  theme_minimal() +
  theme(legend.position = "bottom")

# this bar chart shows the ESG tilt z-score for each ticker
ggplot(data = etf_comparison, aes(x = ticker, y = esg_tilt_z_score)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "ESG Tilt Z-Score by ETF Ticker",
       x = "ETF Ticker",
       y = "ESG Tilt Z-Score") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1))

# this graph illustrates which sector has the largest total ESG ETF weight;
# in this case, information technology ranks highest in terms of ESG ETF weight
ggplot(data = etf_comparison, aes(x = sector, y = esg_etf)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(title = "Classification by sector",
       x = "Sector",
       y = "ESG ETF") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")