#hw1
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the dataset
esg_data <- read_csv("https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/ishares_etf_screener_as_of_2023-12-27.csv")
Rows: 424 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): ticker, name, asset_class, sub_asset_class, region, market, locat...
dbl   (6): gross_expense_ratio_percent, net_expense_ratio_percent, net_asset...
dttm  (2): incept_date, net_assets_as_of

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Process the data
esg_processed <- esg_data %>%
  mutate(standard_or_esg = if_else(is.na(sustainable_classification), "Standard", "ESG")) %>%
  filter(standard_or_esg == "ESG") %>%
  group_by(sustainable_classification) %>%
  summarise(
    Top_Fund_Assets = max(net_assets_usd, na.rm = TRUE),
    Total_Assets = sum(net_assets_usd, na.rm = TRUE)
  ) %>%
  mutate(Percent_of_Assets = (Top_Fund_Assets / Total_Assets) * 100) %>%
  arrange(desc(Percent_of_Assets))



#hw2
# Filter for the 10 smallest ETFs
smallest_etfs <- esg_data %>%
  arrange(net_assets_usd) %>%
  slice_head(n = 10)

# Create a sorted bar chart
ggplot(smallest_etfs, aes(x = fct_reorder(name, net_assets_usd), y = net_assets_usd)) +
  geom_bar(stat = "identity") +
  coord_flip() + # Flips the chart to make it horizontal
  labs(title = "10 Smallest ETFs by Net Assets", x = "Fund Name", y = "Net Assets (USD)") +
  theme_minimal()

#hw3
# Find the threshold for the bottom quintile of MSCI ESG quality scores
bottom_quintile_threshold <- quantile(esg_data$msci_esg_quality_score_0_10, 0.2, na.rm = TRUE)

# Filter the funds in the bottom quintile
bottom_quintile_funds <- esg_data %>%
  filter(msci_esg_quality_score_0_10 <= bottom_quintile_threshold)

# Calculate the number of funds and total assets, grouped by asset class and sub-asset class
bottom_quintile_summary <- bottom_quintile_funds %>%
  group_by(asset_class, sub_asset_class) %>%
  summarise(
    Number_of_Funds = n(),
    Total_Assets = sum(net_assets_usd, na.rm = TRUE),
    .groups = 'drop'
  )

#hw4
library(ggplot2)
top_5_esg <- esg_data %>%
  arrange(desc(net_assets_usd)) %>%
  top_n(5, net_assets_usd)

ggplot(top_5_esg, aes(x = reorder(name, net_assets_usd), y = net_assets_usd)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 ESG Funds by Net Assets", x = "Fund Name", y = "Net Assets (USD)") +
  coord_flip()

ggplot(esg_data, aes(x = msci_esg_quality_score_0_10)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "black") +
  labs(title = "Distribution of MSCI ESG Quality Scores", x = "ESG Score", y = "Count")
Warning: Removed 44 rows containing non-finite values (`stat_bin()`).

esg_data %>%
  group_by(asset_class) %>%
  summarise(Total_Assets = sum(net_assets_usd, na.rm = TRUE)) %>%
  ggplot(aes(x = "", y = Total_Assets, fill = asset_class)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y") +
  labs(title = "Asset Class Breakdown of ESG Funds", x = "", y = "Total Assets (USD)")

ggplot(esg_data, aes(x = asset_class, y = msci_esg_quality_score_0_10)) +
  geom_boxplot() +
  labs(title = "Comparison of Average ESG Scores by Asset Class", x = "Asset Class", y = "ESG Score")
Warning: Removed 44 rows containing non-finite values (`stat_boxplot()`).

esg_data %>%
  group_by(incept_date) %>%
  summarise(Average_Size = mean(net_assets_usd, na.rm = TRUE)) %>%
  ggplot(aes(x = incept_date, y = Average_Size)) +
  geom_line() +
  labs(title = "Trend of Average ESG Fund Sizes Over Time", x = "Date", y = "Average Fund Size (USD)")