Step 1: Load Required Libraries

library(tidyverse)
library(readxl)
library(scales)

Step 2: Import Data

# Option 1: If your data files are on Desktop
bikes_tbl <- read_excel("C:/Users/ASUS/OneDrive/Desktop/bikes (1).xlsx")
bikeshops_tbl <- read_excel("C:/Users/ASUS/OneDrive/Desktop/bikeshops.xlsx")
orderlines_tbl <- read_excel("C:/Users/ASUS/OneDrive/Desktop/orderlines.xlsx")

# Option 2: If files are in the same folder as your .Rmd file
# bikes_tbl <- read_excel("bikes.xlsx")
# bikeshops_tbl <- read_excel("bikeshops.xlsx")
# orderlines_tbl <- read_excel("orderlines.xlsx")

Step 3: Join the Data

# Join all three tables together
bike_orderlines_joined_tbl <- orderlines_tbl %>%
  left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>%
  left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))

# Check the joined data
head(bike_orderlines_joined_tbl)
## # A tibble: 6 × 12
##   ...1  order.id order.line order.date          customer.id product.id quantity
##   <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>    <dbl>
## 1 1            1          1 2011-01-07 00:00:00           2         48        1
## 2 2            1          2 2011-01-07 00:00:00           2         52        1
## 3 3            2          1 2011-01-10 00:00:00          10         76        1
## 4 4            2          2 2011-01-10 00:00:00          10         52        1
## 5 5            3          1 2011-01-10 00:00:00           6          2        1
## 6 6            3          2 2011-01-10 00:00:00           6         50        1
## # ℹ 5 more variables: model <chr>, description <chr>, price <dbl>,
## #   bikeshop.name <chr>, location <chr>

Step 4: Wrangle the Data

# Separate description column and create calculated columns
bike_orderlines_wrangled_tbl <- bike_orderlines_joined_tbl %>%
  # Separate description into three parts
  separate(description,
           into = c("category_1", "category_2", "frame_material"),
           sep = " - ") %>%
  # Separate location into city and state
  separate(location,
           into = c("city", "state"),
           sep = ", ",
           remove = FALSE) %>%
  # Create total price column
  mutate(total_price = price * quantity) %>%
  # Remove unnecessary columns
  select(-...1, -location) %>%
  # Rename columns to use underscores instead of dots
  set_names(names(.) %>% str_replace_all("\\.", "_"))

# View the wrangled data
head(bike_orderlines_wrangled_tbl)
## # A tibble: 6 × 15
##   order_id order_line order_date          customer_id product_id quantity model 
##      <dbl>      <dbl> <dttm>                    <dbl>      <dbl>    <dbl> <chr> 
## 1        1          1 2011-01-07 00:00:00           2         48        1 Jekyl…
## 2        1          2 2011-01-07 00:00:00           2         52        1 Trigg…
## 3        2          1 2011-01-10 00:00:00          10         76        1 Beast…
## 4        2          2 2011-01-10 00:00:00          10         52        1 Trigg…
## 5        3          1 2011-01-10 00:00:00           6          2        1 Super…
## 6        3          2 2011-01-10 00:00:00           6         50        1 Jekyl…
## # ℹ 8 more variables: category_1 <chr>, category_2 <chr>, frame_material <chr>,
## #   price <dbl>, bikeshop_name <chr>, city <chr>, state <chr>,
## #   total_price <dbl>

Step 5: Calculate Revenue by Category 2

# Group by category_2 and sum the total_price
revenue_by_category2_tbl <- bike_orderlines_wrangled_tbl %>%
  select(category_2, total_price) %>%
  group_by(category_2) %>%
  summarise(revenue = sum(total_price)) %>%
  ungroup()

# Display the revenue table
revenue_by_category2_tbl
## # A tibble: 9 × 2
##   category_2          revenue
##   <chr>                 <dbl>
## 1 Cross Country Race 19224630
## 2 Cyclocross          2108120
## 3 Elite Road         15334665
## 4 Endurance Road     10381060
## 5 Fat Bike            1052620
## 6 Over Mountain       7571270
## 7 Sport               1932755
## 8 Trail               9373460
## 9 Triathalon          4053750

Step 6: Create the Bar Chart

# Create horizontal bar chart
revenue_by_category2_tbl %>%
  # Reorder categories by revenue (descending)
  mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>%
  # Create plot
  ggplot(aes(x = category_2, y = revenue)) +
  geom_col(fill = "blue") +
  # Flip coordinates to make horizontal bars
  coord_flip() +
  # Format y-axis labels as numbers with scientific notation
  scale_y_continuous(labels = label_number(scale = 1e-6, suffix = "e+06")) +
  # Add labels
  labs(
    title = "Revenue by Product Category",
    x = "category_2",
    y = "revenue"
  ) +
  # Use minimal theme
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12)
  )

Summary Statistics

# Show top 5 categories by revenue
revenue_by_category2_tbl %>%
  arrange(desc(revenue)) %>%
  mutate(revenue_formatted = dollar(revenue)) %>%
  head(5)
## # A tibble: 5 × 3
##   category_2          revenue revenue_formatted
##   <chr>                 <dbl> <chr>            
## 1 Cross Country Race 19224630 $19,224,630      
## 2 Elite Road         15334665 $15,334,665      
## 3 Endurance Road     10381060 $10,381,060      
## 4 Trail               9373460 $9,373,460       
## 5 Over Mountain       7571270 $7,571,270
# Total revenue across all categories
total_revenue <- sum(revenue_by_category2_tbl$revenue)
cat("Total Revenue:", dollar(total_revenue), "\n")
## Total Revenue: $71,032,330

```