Data Import Practice Problems

Author

Yuanling Zeng

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(here)
here() starts at /Users/zengyuanling/Desktop
library(conflicted)
conflict_prefer("filter", "dplyr")
[conflicted] Will prefer dplyr::filter over any other package.

Problem 1:

Your boss knows that they will be asking her about how the implications of the electric vehicle (EV) market and policy choices on the demand for copper and cobalt.

1. Use the workflow developed in this chapter to import and tidy worksheet 2.3 EV from the dataset.

path_to_sheet <- "/Users/zengyuanling/Desktop/Sus Fin/CM_Data_Explorer.xlsx"
read_EV_sheet <- partial(
  .f = read_excel,
  path = path_to_sheet,
  sheet = "2.3 EV",
  col_names = FALSE
)
sheet_header <- read_EV_sheet(range = "A4:W5")
New names:
• `` -> `...1`
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
• `` -> `...16`
• `` -> `...17`
• `` -> `...18`
• `` -> `...19`
• `` -> `...20`
• `` -> `...21`
• `` -> `...22`
• `` -> `...23`
sheet_header
# A tibble: 2 × 23
  ...1   ...2 ...3  ...4    ...5  ...6  ...7  ...8  ...9 ...10 ...11 ...12 ...13
  <lgl> <dbl> <lgl> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <chr> <dbl> <dbl>
1 NA       NA NA    State…    NA    NA    NA    NA    NA NA    Anno…    NA    NA
2 NA     2022 NA    2025    2030  2035  2040  2045  2050 NA    2025   2030  2035
# ℹ 10 more variables: ...14 <dbl>, ...15 <dbl>, ...16 <dbl>, ...17 <lgl>,
#   ...18 <chr>, ...19 <dbl>, ...20 <dbl>, ...21 <dbl>, ...22 <dbl>,
#   ...23 <dbl>
sheet_header_processed <- sheet_header |> 
  t() |>
  as_tibble() |>
  rename(scenario = V1, year = V2) |>
  fill(scenario) |>
  replace_na(list(scenario = "Current Year"))
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
sheet_header_processed
# A tibble: 23 × 2
   scenario                 year 
   <chr>                    <chr>
 1 Current Year             <NA> 
 2 Current Year             2022 
 3 Current Year             <NA> 
 4 Stated policies scenario 2025 
 5 Stated policies scenario 2030 
 6 Stated policies scenario 2035 
 7 Stated policies scenario 2040 
 8 Stated policies scenario 2045 
 9 Stated policies scenario 2050 
10 Stated policies scenario <NA> 
# ℹ 13 more rows
EV_name <- read_EV_sheet(range = "A7") |> 
  pull()
New names:
• `` -> `...1`
EV_name
[1] "Constrained nickel supply"
EV_info <- read_EV_sheet(range = "A8:W18")
New names:
• `` -> `...1`
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
• `` -> `...16`
• `` -> `...17`
• `` -> `...18`
• `` -> `...19`
• `` -> `...20`
• `` -> `...21`
• `` -> `...22`
• `` -> `...23`
EV_info
# A tibble: 11 × 23
   ...1         ...2 ...3     ...4    ...5    ...6    ...7    ...8    ...9 ...10
   <chr>       <dbl> <lgl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <lgl>
 1 Copper    3.81e+2 NA    617.    1.39e+3 1.74e+3 2.23e+3 2.42e+3 2.31e+3 NA   
 2 Cobalt    6.35e+1 NA     61.2   3.80e+1 2.91e+1 3.54e+1 4.09e+1 4.74e+1 NA   
 3 Graphite  5.57e+2 NA    936.    1.59e+3 1.78e+3 1.69e+3 1.49e+3 1.08e+3 NA   
 4 Lithium   6.96e+1 NA    123.    2.19e+2 3.07e+2 4.07e+2 4.50e+2 4.13e+2 NA   
 5 Manganese 7.45e+1 NA     53.9   1.56e+2 3.36e+2 5.33e+2 7.12e+2 8.58e+2 NA   
 6 Nickel    3.13e+2 NA    554.    5.13e+2 5.89e+2 6.30e+2 6.92e+2 6.88e+2 NA   
 7 Silicon   8.70e+0 NA     40.5   1.39e+2 2.24e+2 3.52e+2 3.74e+2 3.73e+2 NA   
 8 Neodymium 3.96e+0 NA      7.25  1.22e+1 1.51e+1 1.88e+1 2.18e+1 2.29e+1 NA   
 9 Dysprosi… 4.13e-1 NA      0.741 1.21e+0 1.48e+0 1.84e+0 2.13e+0 2.23e+0 NA   
10 Praseody… 5.94e-1 NA      1.09  1.83e+0 2.27e+0 2.83e+0 3.27e+0 3.43e+0 NA   
11 Terbium   8.06e-2 NA      0.148 2.52e-1 3.14e-1 3.91e-1 4.52e-1 4.74e-1 NA   
# ℹ 13 more variables: ...11 <dbl>, ...12 <dbl>, ...13 <dbl>, ...14 <dbl>,
#   ...15 <dbl>, ...16 <dbl>, ...17 <lgl>, ...18 <dbl>, ...19 <dbl>,
#   ...20 <dbl>, ...21 <dbl>, ...22 <dbl>, ...23 <dbl>
EV_info_col_names <- names(EV_info)

sheet_headers_and_col_names <- sheet_header_processed |> 
  add_column(EV_info_col_names = EV_info_col_names)

EV_info_long <- EV_info |> 
  rename(indicator = `...1`) |> 
  pivot_longer(cols = -indicator,
               names_to = "EV_info_col_names") |> 
  add_column(EV_name)

combined_data <- EV_info_long |> 
  left_join(sheet_headers_and_col_names, by = join_by(EV_info_col_names)) |> 
  filter(!is.na(year)) |> 
    mutate(year = as.integer(year)) |> 
  select(EV_name, indicator, scenario, year, value)
combined_data
# A tibble: 209 × 5
   EV_name                   indicator scenario                    year value
   <chr>                     <chr>     <chr>                      <int> <dbl>
 1 Constrained nickel supply Copper    Current Year                2022  381.
 2 Constrained nickel supply Copper    Stated policies scenario    2025  617.
 3 Constrained nickel supply Copper    Stated policies scenario    2030 1389.
 4 Constrained nickel supply Copper    Stated policies scenario    2035 1736.
 5 Constrained nickel supply Copper    Stated policies scenario    2040 2233.
 6 Constrained nickel supply Copper    Stated policies scenario    2045 2418.
 7 Constrained nickel supply Copper    Stated policies scenario    2050 2313.
 8 Constrained nickel supply Copper    Announced pledges scenario  2025  732.
 9 Constrained nickel supply Copper    Announced pledges scenario  2030 2113.
10 Constrained nickel supply Copper    Announced pledges scenario  2035 3587.
# ℹ 199 more rows
read_EV_table <-
  function(EV_name_range, EV_info_range) {
    EV_name <-
      read_EV_sheet(range = EV_name_range) |>
      pull()
    
    EV_info <- read_EV_sheet(range = EV_info_range)
    EV_info_col_names <- names(EV_info)
    EV_info_long <- EV_info |>
      rename(indicator = `...1`) |>
      pivot_longer(cols = -indicator,
                   names_to = "EV_info_col_names") |>
      add_column(EV_name)
    
    combined_data <- EV_info_long |>
      left_join(sheet_headers_and_col_names, by = join_by(EV_info_col_names)) |>
      filter(!is.na(year)) |>
      mutate(
        year = as.integer(year)
      ) |>
      select(EV_name, indicator, scenario, year, value)
    combined_data
  }

nickel_supply_table <- read_EV_table(
  EV_name_range = "A7",
  EV_info_range = "A8:W19"
)
New names:
New names:
• `` -> `...1`
anodes_use_table <- read_EV_table(
  EV_name_range = "A22",
  EV_info_range = "A23:W24"
)
New names:
New names:
• `` -> `...1`
battery_upstake_table <- read_EV_table(
  EV_name_range = "A37",
  EV_info_range = "A38:W39"
)
New names:
New names:
• `` -> `...1`
battery_size_table <- read_EV_table(
  EV_name_range = "A52",
  EV_info_range = "A53:W54"
)
New names:
New names:
• `` -> `...1`
battery_size_reduction_table <- read_EV_table(
  EV_name_range = "A67",
  EV_info_range = "A68:W79"
)
New names:
New names:
• `` -> `...1`
base_case_table <- read_EV_table(
  EV_name_range = "A82",
  EV_info_range = "A83:W84"
)
New names:
New names:
• `` -> `...1`
final_EV_table <- nickel_supply_table |> 
  bind_rows(anodes_use_table) |> 
  bind_rows(battery_upstake_table) |>
  bind_rows(battery_size_table) |>
  bind_rows(battery_size_reduction_table) |>
  bind_rows(base_case_table)
final_EV_table
# A tibble: 608 × 5
   EV_name                   indicator scenario                    year value
   <chr>                     <chr>     <chr>                      <int> <dbl>
 1 Constrained nickel supply Copper    Current Year                2022  381.
 2 Constrained nickel supply Copper    Stated policies scenario    2025  617.
 3 Constrained nickel supply Copper    Stated policies scenario    2030 1389.
 4 Constrained nickel supply Copper    Stated policies scenario    2035 1736.
 5 Constrained nickel supply Copper    Stated policies scenario    2040 2233.
 6 Constrained nickel supply Copper    Stated policies scenario    2045 2418.
 7 Constrained nickel supply Copper    Stated policies scenario    2050 2313.
 8 Constrained nickel supply Copper    Announced pledges scenario  2025  732.
 9 Constrained nickel supply Copper    Announced pledges scenario  2030 2113.
10 Constrained nickel supply Copper    Announced pledges scenario  2035 3587.
# ℹ 598 more rows

2. Use the tidied dataset to come up with 5 compelling data visualizations that illustrate key actionable insights about how policy scenarios, and technological scenarios will impact demand for copper and cobalt.

data visualization 1: bar chart of Copper and Cobalt Demand In Different Scenarios and Time

cobalt <- combined_data[combined_data$indicator == "Cobalt", ]
copper <- combined_data[combined_data$indicator == "Copper", ]

combined_metals <- rbind(
  transform(cobalt, metal = "Cobalt"),
  transform(copper, metal = "Copper")
)
ggplot(combined_metals, aes(x = year, y = value, fill = scenario)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ metal, scales = "free_y") +
  labs(title = "Copper and Cobalt Demand In Different Scenarios and Time", x = "Year", y = "Demand (kiloton)", fill = "Scenario") +
  theme_minimal()

data visualization 2: bar chart of Copper and Cobalt Demand Comparison Over Time

copper_cobalt_data <- final_EV_table %>%
  filter(indicator %in% c("Copper", "Cobalt"))

ggplot(copper_cobalt_data, aes(x = factor(year), y = value, fill = indicator)) +
  geom_bar(stat = "identity", position = position_dodge()) + 
  labs(title = "Copper and Cobalt Demand Comparison Over Time",
       x = "Year",
       y = "Demand (kiloton)",
       fill = "Metal") +
  theme_minimal()
Warning: Removed 24 rows containing missing values (`geom_bar()`).

data visualization 3: bubble chart of Copper Demand Distributions Under Different Scenarios

ggplot(final_EV_table, aes(x = value, y = scenario, fill = scenario)) +
  geom_point(alpha = 0.7, size = 3, shape = 21) +
  scale_fill_manual(values = c("Stated policies scenario" = "gold", 
                               "Net Zero Emissions by 2050 scenario" = "pink",
                               "Current Year" = "green3", 
                               "Announced pledges scenario" = "skyblue")) +
  labs(title = "Copper Demand Distributions Under Different Scenarios",
       x = "Copper Demand (kiloton)",
       y = "Scenario",
       fill = "Scenario") +
  theme_minimal()
Warning: Removed 144 rows containing missing values (`geom_point()`).

data visualization 4: line chart of Cobalt Demand Under Announced Pledges Scenario

cobalt_data <- combined_data[combined_data$scenario == "Announced pledges scenario" & 
                             combined_data$indicator == "Cobalt", ]
ggplot(cobalt_data, aes(x = year, y = value)) +
  geom_line(color = "blue") + 
  labs(title = "Cobalt Demand Under Announced Pledges Scenario",
       x = "Year",
       y = "Demand (kiloton)") +
  theme_minimal()

data visualization 5: line chart of Copper and Cobalt Demand Under Net Zero Emissions by 2050 Scenario

scenario_data <- combined_data[combined_data$scenario == "Net Zero Emissions by 2050 scenario", ]
metal_data <- scenario_data[scenario_data$indicator %in% c("Copper", "Cobalt"), ]
ggplot(metal_data, aes(x = year, y = value, color = indicator)) +
  geom_line(size = 1) +
  labs(title = "Copper and Cobalt Demand Under Net Zero Emissions by 2050 Scenario",
       x = "Year",
       y = "Demand (kiloton)",
       color = "Metal") +
  scale_color_manual(values = c("Copper" = "orange", "Cobalt" = "blue")) +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.