Data Visualization HW

Author

Rachel Zhang

Data Analysis Boot Camp: Data Visualization

1.5.1.2 Homework problem 1: Recreate the chart above

options(repos = c(CRAN = "https://cran.rstudio.com/"))
#remotes::install_github("dreamRs/esquisse")
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(esquisse)
library(ggplot2)

github_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/etf_comparison-2022-10-03.csv"


# read the data from GitHub
blackrock_esg_vs_non_esg_etf <- github_url |> 
  read_csv() |> 
  # select the four columns we will use in our anlaysis here
  select(company_name:standard_etf)
Rows: 537 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ticker, company_name, sector, esg_uw_ow
dbl (7): esg_etf, standard_etf, esg_tilt, esg_tilt_z_score, esg_tilt_rank, e...
lgl (3): in_esg_only, in_standard_only, in_on_index_only

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(blackrock_esg_vs_non_esg_etf) +
  aes(x = esg_etf, y = standard_etf) +
  geom_point(
    shape = "circle",
    size = 1.65,
    colour = "#112446"
  ) +
  geom_smooth(span = 0.82) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(
    x = "ESG ETF",
    y = "Standard ETF",
    title = "Relationship between ESG ETF and Standard ETF",
    caption = "Rachel Zhang"
  ) +
  theme_classic()
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 261 rows containing non-finite values (`stat_smooth()`).

1.5.1.3 Homework problem 2: exploring the outliers

We’re going change our data from wide to long. Don’t worry too much about understanding this now, as we’ll cover it in detail in the coming weeks. Just know that it will enable us to compare our two ETFs in new ways.

library(tidyverse)
library(esquisse)
library(ggplot2)

# the URL of our data on GitHub
github_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/etf_comparison-2022-10-03.csv"


# read the data from GitHub
blackrock_esg_vs_non_esg_etf <- github_url |> 
  read_csv() |> 
  # select the four columns we will use in our anlaysis here
  select(company_name:standard_etf)
Rows: 537 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ticker, company_name, sector, esg_uw_ow
dbl (7): esg_etf, standard_etf, esg_tilt, esg_tilt_z_score, esg_tilt_rank, e...
lgl (3): in_esg_only, in_standard_only, in_on_index_only

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
blackrock_esg_vs_non_esg_etf_long <- blackrock_esg_vs_non_esg_etf |> 
  # we'll learn a lot more about long data & pivot_longer() in future weeks. 
  pivot_longer(cols = contains("etf"), names_to = "fund_type", values_to = "weight") |> 
  # case_when() is like an extended "if else"
  mutate(fund_type = case_when(fund_type == "esg_etf" ~ "ESG ETF (ESGU)",
                               fund_type == "standard_etf" ~ "Standard ETF (IVV)"))

blackrock_esg_vs_non_esg_etf_long
# A tibble: 1,074 × 4
   company_name                  sector                 fund_type         weight
   <chr>                         <chr>                  <chr>              <dbl>
 1 PRUDENTIAL FINANCIAL INC      Financials             ESG ETF (ESGU)    0.537 
 2 PRUDENTIAL FINANCIAL INC      Financials             Standard ETF (IV… 0.106 
 3 GENERAL MILLS INC             Consumer Staples       ESG ETF (ESGU)    0.552 
 4 GENERAL MILLS INC             Consumer Staples       Standard ETF (IV… 0.151 
 5 KELLOGG                       Consumer Staples       ESG ETF (ESGU)    0.453 
 6 KELLOGG                       Consumer Staples       Standard ETF (IV… 0.0592
 7 AUTOMATIC DATA PROCESSING INC Information Technology ESG ETF (ESGU)    0.649 
 8 AUTOMATIC DATA PROCESSING INC Information Technology Standard ETF (IV… 0.312 
 9 ECOLAB INC                    Materials              ESG ETF (ESGU)    0.441 
10 ECOLAB INC                    Materials              Standard ETF (IV… 0.118 
# ℹ 1,064 more rows
blackrock_esg_vs_non_esg_etf_long %>%
  #limit the weight variable to companies over a 1% weight.
 filter(weight >= 0.1 & weight <= 7) %>%
 ggplot() +
 aes(x = weight, y = company_name, colour = fund_type, size = weight) +
 geom_point(shape = "circle") +
  #Change the color for the ESG fund points to be green colored, and the non-esg fund to be grey
 scale_color_manual(values = c(`ESG ETF (ESGU)` = "#46DB98", `Standard ETF (IVV)` = "#9CAAA4"
)) +
  #Add meaningful titles and labels.
 labs(x = "Weight", y = "Company Name", subtitle = "Weight in ESG ETF and Standard ETF", caption = "Rachel Zhang") +
 theme_minimal()+ 
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

1.5.1.4 Homework problem 3: Make your own charts with esquisse

github_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/etf_comparison-2022-10-03.csv"


# read the data from GitHub
blackrock_esg_vs_non_esg_etf <- github_url |> 
  read_csv() |> 
  # select the four columns we will use in our anlaysis here
  select(company_name:standard_etf)
Rows: 537 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ticker, company_name, sector, esg_uw_ow
dbl (7): esg_etf, standard_etf, esg_tilt, esg_tilt_z_score, esg_tilt_rank, e...
lgl (3): in_esg_only, in_standard_only, in_on_index_only

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
blackrock_esg_vs_non_esg_etf_long <- blackrock_esg_vs_non_esg_etf |> 
  # we'll learn a lot more about long data & pivot_longer() in future weeks. 
  pivot_longer(cols = contains("etf"), names_to = "fund_type", values_to = "weight") |> 
  # case_when() is like an extended "if else"
  mutate(fund_type = case_when(fund_type == "esg_etf" ~ "ESG ETF (ESGU)",
                               fund_type == "standard_etf" ~ "Standard ETF (IVV)"))

blackrock_esg_vs_non_esg_etf_long
# A tibble: 1,074 × 4
   company_name                  sector                 fund_type         weight
   <chr>                         <chr>                  <chr>              <dbl>
 1 PRUDENTIAL FINANCIAL INC      Financials             ESG ETF (ESGU)    0.537 
 2 PRUDENTIAL FINANCIAL INC      Financials             Standard ETF (IV… 0.106 
 3 GENERAL MILLS INC             Consumer Staples       ESG ETF (ESGU)    0.552 
 4 GENERAL MILLS INC             Consumer Staples       Standard ETF (IV… 0.151 
 5 KELLOGG                       Consumer Staples       ESG ETF (ESGU)    0.453 
 6 KELLOGG                       Consumer Staples       Standard ETF (IV… 0.0592
 7 AUTOMATIC DATA PROCESSING INC Information Technology ESG ETF (ESGU)    0.649 
 8 AUTOMATIC DATA PROCESSING INC Information Technology Standard ETF (IV… 0.312 
 9 ECOLAB INC                    Materials              ESG ETF (ESGU)    0.441 
10 ECOLAB INC                    Materials              Standard ETF (IV… 0.118 
# ℹ 1,064 more rows
#Chart 1: Histogram for ESG ETF distribution in each sector
ggplot(blackrock_esg_vs_non_esg_etf) +
  aes(x = sector, weight = standard_etf) +
  geom_bar(fill = "#228B22") +
  labs(x = "Sector", y = "ESG ETF", caption = "Rachel Zhang") +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

#Chart 2: Histogram for Standard ETF distribution in each sector

ggplot(blackrock_esg_vs_non_esg_etf) +
  aes(x = standard_etf, y = sector) +
  geom_boxplot(fill = "#B22222") +
  geom_jitter() +
  scale_x_continuous(trans = "log10") +
  labs(
    x = "Sector",
    y = "Standard ETF",
    title = "Standard ETF Distribution ",
    caption = "Rachel Zhang"
  ) +
  coord_flip() +
  theme_light() + 
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
Warning: Transformation introduced infinite values in continuous x-axis
Transformation introduced infinite values in continuous x-axis
Warning: Removed 35 rows containing non-finite values (`stat_boxplot()`).

1.5.2.1 Homework problem 4: Understanding aes()

github_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/etf_comparison-2022-10-03.csv"


# read the data from GitHub
blackrock_esg_vs_non_esg_etf <- github_url |> 
  read_csv() |> 
  # select the four columns we will use in our anlaysis here
  select(company_name:standard_etf)
Rows: 537 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ticker, company_name, sector, esg_uw_ow
dbl (7): esg_etf, standard_etf, esg_tilt, esg_tilt_z_score, esg_tilt_rank, e...
lgl (3): in_esg_only, in_standard_only, in_on_index_only

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
blackrock_esg_vs_non_esg_etf_long <- blackrock_esg_vs_non_esg_etf |> 
  # we'll learn a lot more about long data & pivot_longer() in future weeks. 
  pivot_longer(cols = contains("etf"), names_to = "fund_type", values_to = "weight") |> 
  # case_when() is like an extended "if else"
  mutate(fund_type = case_when(fund_type == "esg_etf" ~ "ESG ETF (ESGU)",
                               fund_type == "standard_etf" ~ "Standard ETF (IVV)"))

blackrock_esg_vs_non_esg_etf_long
# A tibble: 1,074 × 4
   company_name                  sector                 fund_type         weight
   <chr>                         <chr>                  <chr>              <dbl>
 1 PRUDENTIAL FINANCIAL INC      Financials             ESG ETF (ESGU)    0.537 
 2 PRUDENTIAL FINANCIAL INC      Financials             Standard ETF (IV… 0.106 
 3 GENERAL MILLS INC             Consumer Staples       ESG ETF (ESGU)    0.552 
 4 GENERAL MILLS INC             Consumer Staples       Standard ETF (IV… 0.151 
 5 KELLOGG                       Consumer Staples       ESG ETF (ESGU)    0.453 
 6 KELLOGG                       Consumer Staples       Standard ETF (IV… 0.0592
 7 AUTOMATIC DATA PROCESSING INC Information Technology ESG ETF (ESGU)    0.649 
 8 AUTOMATIC DATA PROCESSING INC Information Technology Standard ETF (IV… 0.312 
 9 ECOLAB INC                    Materials              ESG ETF (ESGU)    0.441 
10 ECOLAB INC                    Materials              Standard ETF (IV… 0.118 
# ℹ 1,064 more rows
#chart 1

ggplot(blackrock_esg_vs_non_esg_etf) +
  aes(x = esg_etf, y = standard_etf) +
  geom_point(aes(colour = sector), shape = "circle", size = 1.5) +
  geom_smooth(span = 0.91) +
  scale_color_hue(direction = 1) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  theme_minimal()
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 261 rows containing non-finite values (`stat_smooth()`).

#Chart 2
ggplot(blackrock_esg_vs_non_esg_etf) +
  aes(x = esg_etf, y = standard_etf, colour = sector) +
  geom_point(shape = "circle", size = 1.5) +
  geom_smooth(span = 0.91) +
  scale_color_hue(direction = 1) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  theme_minimal()
Warning: Transformation introduced infinite values in continuous x-axis
Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 261 rows containing non-finite values (`stat_smooth()`).

#Chart 3

ggplot(blackrock_esg_vs_non_esg_etf) +
  aes(x = esg_etf, y = standard_etf) +
  geom_point(shape = "circle", size = 1.5, colour = "#7B1494") +
  geom_smooth(span = 0.75, color = "yellow") +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(x = "ESG ETF", y = "Standard ETF") +
  theme_minimal()
Warning: Transformation introduced infinite values in continuous x-axis
Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 261 rows containing non-finite values (`stat_smooth()`).