library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(writexl)
# Import Excel files
bikes_tbl <- read_excel("bikes.xlsx")
bikeshops_tbl <- read_excel("bikeshops.xlsx")
orderlines_tbl <- read_excel("orderlines.xlsx")
## New names:
## • `` -> `...1`
# Import CSV file
bike_orderlines_tbl <- read_csv("bike_orderlines.csv")
## Rows: 15644 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): model, category_1, category_2, frame_material, bikeshop_name, city...
## dbl (5): order_id, order_line, quantity, price, total_price
## dttm (1): order_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Join data
bike_orderlines_bikeshops_joined <- left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id")) %>%
left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))
# Wrangle data
bike_orderlines_wrangled_tbl <- bike_orderlines_bikeshops_joined %>%
separate(description, into = c("category_1", "category_2", "frame_material"), sep = " - ") %>%
separate(location, into = c("city", "state"), sep = ", ", remove = FALSE) %>%
mutate(total_price = price * quantity) %>%
select(-...1, -location) %>%
rename(order_date = order.date) %>%
set_names(names(.) %>% str_replace_all("\\.", "_"))
# Rename columns
bike_orderlines_wrangled_tbl <- bike_orderlines_wrangled_tbl %>%
rename(`Prime Category` = category_1,
`Secondary Category` = category_2,
`Frame Material` = frame_material)
# Create summary table
category_sales_tbl <- bike_orderlines_wrangled_tbl %>%
distinct(`Prime Category`, `Secondary Category`, `Frame Material`, .keep_all = TRUE) %>%
mutate(Sales = scales::dollar(total_price)) %>%
select(`Prime Category`, `Secondary Category`, `Frame Material`, Sales)
print(category_sales_tbl)
## # A tibble: 13 × 4
## `Prime Category` `Secondary Category` `Frame Material` Sales
## <chr> <chr> <chr> <chr>
## 1 Mountain Over Mountain Carbon $6,070
## 2 Mountain Trail Aluminum $2,770
## 3 Road Elite Road Carbon $10,660
## 4 Road Endurance Road Aluminum $1,570
## 5 Road Endurance Road Carbon $4,800
## 6 Mountain Sport Aluminum $480
## 7 Mountain Cross Country Race Carbon $22,380
## 8 Mountain Cross Country Race Aluminum $2,060
## 9 Road Cyclocross Carbon $1,960
## 10 Road Triathalon Carbon $4,500
## 11 Road Elite Road Aluminum $2,240
## 12 Mountain Trail Carbon $8,960
## 13 Mountain Fat Bike Aluminum $3,730