Data Visualization Practice Problems

Author

Yuanling Zeng

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Location of the ETF comparison CSV on GitHub
github_url <- "https://raw.githubusercontent.com/t-emery/sais-susfin_data/main/datasets/etf_comparison-2022-10-03.csv"

# Download the CSV and keep only the columns used in this analysis:
# everything from company_name through standard_etf.
blackrock_esg_vs_non_esg_etf <- select(
  read_csv(github_url),
  company_name:standard_etf
)
Rows: 537 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ticker, company_name, sector, esg_uw_ow
dbl (7): esg_etf, standard_etf, esg_tilt, esg_tilt_z_score, esg_tilt_rank, e...
lgl (3): in_esg_only, in_standard_only, in_on_index_only

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Problem 1: Recreate the chart above in Esquisse

library(esquisse)
library(ggplot2)
# Problem 1 chart: ESG-fund weight (x) against standard-fund weight (y) for
# every row of the data, coloured by sector and split into one panel per
# sector. Colour is a plot-level aesthetic, so the smoother is also fitted
# and coloured per sector.
ggplot(
  blackrock_esg_vs_non_esg_etf,
  aes(x = esg_etf, y = standard_etf, colour = sector)
) +
  geom_point(shape = "circle", size = 1.5) +
  geom_smooth(span = 0.75) +
  scale_colour_viridis_d(option = "inferno", direction = 1) +
  # log10 on both axes; ggplot2 warns that the transformation introduces
  # infinite values and the smoother drops those rows
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(vars(sector)) +
  labs(
    title = "ESG Fund vs. Non-ESG Fund",
    subtitle = "by using Esquisse",
    x = "ESG ETF (ESGU)",
    y = "Standard ETF (IVV)",
    caption = "Yuanling"
  ) +
  theme_minimal()
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 261 rows containing non-finite values (`stat_smooth()`).

Problem 2: Exploring the outliers

# Reshape to long format: one row per company and fund, with the fund's
# column name replaced by a display label. local() keeps the lookup vector
# out of the global environment.
blackrock_esg_vs_non_esg_etf_long <- local({
  # display label for each fund column; any other name would map to NA
  fund_labels <- c(
    esg_etf = "ESG ETF (ESGU)",
    standard_etf = "Standard ETF (IVV)"
  )

  blackrock_esg_vs_non_esg_etf |>
    # stack every column whose name contains "etf" into fund_type / weight
    pivot_longer(
      cols = contains("etf"),
      names_to = "fund_type",
      values_to = "weight"
    ) |>
    mutate(fund_type = unname(fund_labels[fund_type]))
})

blackrock_esg_vs_non_esg_etf_long
# A tibble: 1,074 × 4
   company_name                  sector                 fund_type         weight
   <chr>                         <chr>                  <chr>              <dbl>
 1 PRUDENTIAL FINANCIAL INC      Financials             ESG ETF (ESGU)    0.537 
 2 PRUDENTIAL FINANCIAL INC      Financials             Standard ETF (IV… 0.106 
 3 GENERAL MILLS INC             Consumer Staples       ESG ETF (ESGU)    0.552 
 4 GENERAL MILLS INC             Consumer Staples       Standard ETF (IV… 0.151 
 5 KELLOGG                       Consumer Staples       ESG ETF (ESGU)    0.453 
 6 KELLOGG                       Consumer Staples       Standard ETF (IV… 0.0592
 7 AUTOMATIC DATA PROCESSING INC Information Technology ESG ETF (ESGU)    0.649 
 8 AUTOMATIC DATA PROCESSING INC Information Technology Standard ETF (IV… 0.312 
 9 ECOLAB INC                    Materials              ESG ETF (ESGU)    0.441 
10 ECOLAB INC                    Materials              Standard ETF (IV… 0.118 
# ℹ 1,064 more rows
# Problem 2 chart: dot plot of the largest holdings, one point per company
# and fund. Point position and point size both encode the weight.
blackrock_esg_vs_non_esg_etf_long |>
  # keep only rows whose weight lies between 1 and 7 (inclusive)
  filter(weight >= 1, weight <= 7) |>
  ggplot(
    aes(x = weight, y = company_name, colour = fund_type, size = weight)
  ) +
  geom_point(shape = "circle") +
  # green for the ESG fund, grey for the standard fund
  scale_colour_manual(
    values = c(
      "ESG ETF (ESGU)" = "#00C19F",
      "Standard ETF (IVV)" = "#807C7C"
    )
  ) +
  # title, axis names and caption
  labs(
    title = "ESG Fund vs. Non ESG Fund",
    x = "Weight",
    y = "Company Name ",
    caption = "Yuanling"
  ) +
  theme_minimal()

# Reflection: 
# Most companies have similar, relatively small weightings in the ESG and non-ESG ETFs, while a few companies, such as Procter & Gamble and Johnson & Johnson, appear in only one fund, the standard ETF.
# Technology companies such as Tesla, Microsoft, Apple and Amazon have relatively large weightings in both funds; Apple has the largest weighting of all.

Problem 3: Make your own charts with esquisse

Chart 1

# Problem 3, Chart 1: total ESG ETF weight by sector.
# esg_etf is each company's weight in the ESG fund, not a count of funds.
# geom_col() stacks one segment per company, so each bar's length is the
# sum of the weights of all companies in that sector.
ggplot(blackrock_esg_vs_non_esg_etf, aes(x = esg_etf, y = sector)) +
  # single fill colour for every stacked segment
  geom_col(fill = "#E89BC1") +
  # axis names, title and caption
  labs(
    x = "Total weight in ESG ETF (%)",
    y = "Sector",
    title = "Total ESG ETF (ESGU) weight by sector",
    caption = "Yuanling"
  ) +
  theme_minimal()

# The chart shows how much of the ESG ETF is allocated to each sector.
# Information Technology carries by far the largest total weight, while the
# Materials sector carries the smallest.

Chart 2

# Problem 3, Chart 2: total standard ETF weight by sector.
# standard_etf is each company's weight in the standard fund, not a count of
# funds. geom_col() stacks one segment per company, so each bar's length is
# the sum of the weights of all companies in that sector.
ggplot(blackrock_esg_vs_non_esg_etf, aes(x = standard_etf, y = sector)) +
  # single fill colour for every stacked segment
  geom_col(fill = "#6FB1E8") +
  # axis names, title and caption
  labs(
    x = "Total weight in standard ETF (%)",
    y = "Sector",
    title = "Total standard ETF (IVV) weight by sector",
    caption = "Yuanling"
  ) +
  theme_minimal()

# This chart shows how much of the standard ETF is allocated to each sector.
# As in the previous chart, Information Technology carries the largest total
# weight. This is consistent with the Problem 2 chart, where technology
# companies have high weightings in both funds.
# The Materials sector again carries the smallest total weight, so it is the
# least represented sector in both funds.

Problem 4: Understanding aes()

Chart 1

# Problem 4, Chart 1: colour is mapped inside geom_point() only, so the
# points are coloured by sector while geom_smooth() does not inherit the
# mapping and fits a single line through all of the data.
ggplot(blackrock_esg_vs_non_esg_etf, aes(x = esg_etf, y = standard_etf)) +
  # scatter layer with a layer-level colour mapping
  geom_point(aes(colour = sector), shape = "circle", size = 1.5) +
  # one smoothed fit line for the whole data set
  geom_smooth(span = 0.75) +
  # discrete hue palette for the sector colours
  scale_colour_hue(direction = 1) +
  # log10 on both axes
  scale_x_log10() +
  scale_y_log10() +
  theme_minimal()
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 261 rows containing non-finite values (`stat_smooth()`).

Chart 2

# Problem 4, Chart 2: x and y are swapped relative to Chart 1, and colour is
# mapped at the plot level, so both the points and the smoother are grouped
# and coloured by sector (one fit line per sector).
ggplot(
  blackrock_esg_vs_non_esg_etf,
  aes(x = standard_etf, y = esg_etf, colour = sector)
) +
  # scatter layer
  geom_point(shape = "circle", size = 1.5) +
  # smoothed fit lines, inheriting the sector colour mapping
  geom_smooth(span = 0.75) +
  # discrete hue palette for the sector colours
  scale_colour_hue(direction = 1) +
  # log10 on both axes
  scale_x_log10() +
  scale_y_log10() +
  theme_minimal()
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 261 rows containing non-finite values (`stat_smooth()`).

Chart 3

# Problem 4, Chart 3: same axes as Chart 1, but colour is a fixed value set
# outside aes(), so every point is purple and no colour legend is produced.
ggplot(blackrock_esg_vs_non_esg_etf, aes(x = esg_etf, y = standard_etf)) +
  # scatter layer with a constant (unmapped) colour
  geom_point(colour = "purple", shape = "circle", size = 1.5) +
  # single smoothed fit line; no colour is set here, so it keeps its default
  geom_smooth(span = 0.75) +
  # log10 on both axes
  scale_x_log10() +
  scale_y_log10() +
  # axis names
  labs(x = "ESG ETF (ESGU)", y = "Standard ETF (IVV)") +
  theme_minimal()
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 261 rows containing non-finite values (`stat_smooth()`).