1. Setup and Data Import

Load Required Packages

# Install packages if needed (run once)
# install.packages(c("readxl", "tidyverse", "lubridate", "tidyquant", "writexl"))

library(tidyverse)
library(readxl)
library(writexl)
library(lubridate)
library(tidyquant)

Import Data

# Import Excel files
bikes_tbl <- read_excel("./bikes.xlsx")
bikeshops_tbl <- read_excel("./bikeshops.xlsx")
orderlines_tbl <- read_excel("./orderlines.xlsx")

# Examine the data structure
glimpse(bikes_tbl)
## Rows: 97
## Columns: 4
## $ bike.id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ model       <chr> "Supersix Evo Black Inc.", "Supersix Evo Hi-Mod Team", "Su…
## $ description <chr> "Road - Elite Road - Carbon", "Road - Elite Road - Carbon"…
## $ price       <dbl> 12790, 10660, 7990, 5330, 4260, 3940, 3200, 2660, 2240, 18…
glimpse(bikeshops_tbl)
## Rows: 30
## Columns: 3
## $ bikeshop.id   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ bikeshop.name <chr> "Pittsburgh Mountain Machines", "Ithaca Mountain Climber…
## $ location      <chr> "Pittsburgh, PA", "Ithaca, NY", "Columbus, OH", "Detroit…
glimpse(orderlines_tbl)
## Rows: 15,644
## Columns: 7
## $ ...1        <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "…
## $ order.id    <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7…
## $ order.line  <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2…
## $ order.date  <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2011-01-1…
## $ customer.id <dbl> 2, 2, 10, 10, 6, 6, 6, 6, 6, 22, 8, 8, 8, 8, 16, 16, 16, 1…
## $ product.id  <dbl> 48, 52, 76, 52, 2, 50, 1, 4, 34, 26, 96, 66, 35, 72, 45, 3…
## $ quantity    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1…

2. Data Wrangling

Join Tables

# Join all three tables using the pipe operator
bike_orderlines_wrangled_tbl <- orderlines_tbl %>% 
  left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>% 
  left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id")) %>% 
  
  # Separate description into multiple columns
  separate(description, 
           into = c('category.1', 'category.2', 'frame.material'), 
           sep  = ' - ') %>% 
  
  # Separate location into city and state
  separate(location, 
           into = c('city', 'state'), 
           sep  = ', ',
           remove = FALSE) %>%
  
  # Calculate total price
  mutate(total.price = price * quantity) %>% 
  
  # Reorganize and rename columns
  select(-...1, -location) %>% 
  select(contains('date'), contains('id'), 
         contains('order'), 
         quantity, price, total.price, 
         everything()) %>% 
  rename(order_date = order.date) %>%
  set_names(names(.) %>% str_replace_all("\\.", "_"))

# Display first few rows
head(bike_orderlines_wrangled_tbl)
## # A tibble: 6 × 15
##   order_date          order_id customer_id product_id order_line quantity price
##   <dttm>                 <dbl>       <dbl>      <dbl>      <dbl>    <dbl> <dbl>
## 1 2011-01-07 00:00:00        1           2         48          1        1  6070
## 2 2011-01-07 00:00:00        1           2         52          2        1  5970
## 3 2011-01-10 00:00:00        2          10         76          1        1  2770
## 4 2011-01-10 00:00:00        2          10         52          2        1  5970
## 5 2011-01-10 00:00:00        3           6          2          1        1 10660
## 6 2011-01-10 00:00:00        3           6         50          2        1  3200
## # ℹ 8 more variables: total_price <dbl>, model <chr>, category_1 <chr>,
## #   category_2 <chr>, frame_material <chr>, bikeshop_name <chr>, city <chr>,
## #   state <chr>
# Save cleaned data
saveRDS(bike_orderlines_wrangled_tbl, './bike_orderlines.rds')

3. Exploratory Data Analysis

Key dplyr Functions

Select and Filter Operations

# Calculate average total price using pull()
avg_price <- bike_orderlines_wrangled_tbl %>% 
  pull(total_price) %>% 
  mean()

cat("Average Order Total Price: $", round(avg_price, 2), "\n")
## Average Order Total Price: $ 4540.55
# Filter bikes above average price
bikes_tbl %>% 
  select(model, price) %>% 
  filter(price > mean(price)) %>% 
  arrange(desc(price)) %>%
  head(10)
## # A tibble: 10 × 2
##    model                          price
##    <chr>                          <dbl>
##  1 Supersix Evo Black Inc.        12790
##  2 Scalpel-Si Black Inc.          12790
##  3 Habit Hi-Mod Black Inc.        12250
##  4 F-Si Black Inc.                11190
##  5 Supersix Evo Hi-Mod Team       10660
##  6 Synapse Hi-Mod Disc Black Inc.  9590
##  7 Scalpel-Si Race                 9060
##  8 F-Si Hi-Mod Team                9060
##  9 Trigger Carbon 1                8200
## 10 Supersix Evo Hi-Mod Dura Ace 1  7990

Unique Categories

# Q1: What are the unique categories of products?
cat("Primary Categories:\n")
## Primary Categories:
bike_orderlines_wrangled_tbl %>% distinct(category_1) %>% pull()
## [1] "Mountain" "Road"
cat("\nSecondary Categories:\n")
## 
## Secondary Categories:
bike_orderlines_wrangled_tbl %>% distinct(category_2) %>% pull()
## [1] "Over Mountain"      "Trail"              "Elite Road"        
## [4] "Endurance Road"     "Sport"              "Cross Country Race"
## [7] "Cyclocross"         "Triathalon"         "Fat Bike"
cat("\nFrame Materials:\n")
## 
## Frame Materials:
bike_orderlines_wrangled_tbl %>% distinct(frame_material) %>% pull()
## [1] "Carbon"   "Aluminum"

Sales by Category

# Q2: Which product categories have the largest sales?
sales_by_category <- bike_orderlines_wrangled_tbl %>% 
  select(category_1, total_price) %>% 
  group_by(category_1) %>% 
  summarise(sales = sum(total_price)) %>%
  ungroup() %>% 
  arrange(desc(sales)) %>%
  rename(`Primary Category` = category_1, 
         Sales = sales) %>% 
  mutate(Sales_Formatted = scales::dollar(Sales))

knitr::kable(sales_by_category, 
             caption = "Total Sales by Primary Category",
             format.args = list(big.mark = ","))
Total Sales by Primary Category
Primary Category Sales Sales_Formatted
Mountain 39,154,735 $39,154,735
Road 31,877,595 $31,877,595

4. Time Series Analysis

Monthly Sales Aggregation

# Calculate monthly sales
bike_sales_m <- bike_orderlines_wrangled_tbl %>% 
  select(order_date, total_price) %>% 
  mutate(order_date = ymd(order_date)) %>% 
  mutate(year_month = floor_date(order_date, unit = "month")) %>%
  group_by(year_month) %>% 
  summarise(sales = sum(total_price)) %>%
  ungroup()

# Show first and last 6 months
head(bike_sales_m, 6)
## # A tibble: 6 × 2
##   year_month   sales
##   <date>       <dbl>
## 1 2011-01-01  483015
## 2 2011-02-01 1162075
## 3 2011-03-01  659975
## 4 2011-04-01 1827140
## 5 2011-05-01  844170
## 6 2011-06-01 1413445
tail(bike_sales_m, 6)
## # A tibble: 6 × 2
##   year_month   sales
##   <date>       <dbl>
## 1 2015-07-01 1166455
## 2 2015-08-01  955090
## 3 2015-09-01  767355
## 4 2015-10-01  949455
## 5 2015-11-01 1065885
## 6 2015-12-01  926070

5. Data Visualizations

Bar Chart: Revenue by Category

revenue_by_category2_tbl <- bike_orderlines_wrangled_tbl %>% 
  select(category_2, total_price) %>% 
  group_by(category_2) %>% 
  summarise(revenue = sum(total_price)) %>% 
  ungroup()

revenue_by_category2_tbl %>% 
  mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>% 
  ggplot(aes(category_2, revenue)) +
  geom_col(fill = "#2c3e50") +
  coord_flip() +
  scale_y_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
  labs(
    title = "Revenue by Bike Category",
    x = "Category",
    y = "Revenue (Millions)"
  ) +
  theme_tq()

Scatter Plot: Order Value Analysis

order_value_tbl <- bike_orderlines_wrangled_tbl %>% 
  select(order_id, order_line, total_price, quantity) %>% 
  group_by(order_id) %>% 
  summarize(
    total_quantity = sum(quantity),
    total_price = sum(total_price)
  ) %>% 
  ungroup()

order_value_tbl %>% 
  ggplot(aes(x = total_quantity, y = total_price)) +
  geom_point(alpha = 0.5, size = 2, color = "#18BC9C") +
  geom_smooth(method = "lm", se = TRUE, color = "#2c3e50") +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Order Value vs. Quantity",
    subtitle = "Relationship between items ordered and total price",
    x = "Total Quantity",
    y = "Total Price"
  ) +
  theme_tq()

Histogram: Price Distribution

bike_orderlines_wrangled_tbl %>% 
  distinct(price, model, frame_material) %>% 
  ggplot(aes(price, fill = frame_material)) +
  geom_histogram(bins = 30, color = "white") + 
  facet_wrap(~ frame_material, ncol = 1) + 
  scale_fill_tq() + 
  scale_x_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Price Distribution by Frame Material",
    x = "Price",
    y = "Count",
    fill = "Frame Material"
  ) +
  theme_tq() +
  theme(legend.position = "none")

Box Plot: Price by Category

unit_price_by_cat2_tbl <- bike_orderlines_wrangled_tbl %>% 
  select(category_2, model, price) %>% 
  distinct() %>% 
  mutate(category_2 = as_factor(category_2) %>% fct_reorder(price))

unit_price_by_cat2_tbl %>% 
  ggplot(aes(category_2, price)) + 
  geom_boxplot(fill = "#18BC9C", alpha = 0.7) + 
  coord_flip() +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Price Distribution by Bike Category",
    x = "Category",
    y = "Price"
  ) +
  theme_tq()

Lollipop Chart: Top 10 Customers

n <- 10

top_customers <- bike_orderlines_wrangled_tbl %>% 
  select(bikeshop_name, total_price) %>% 
  mutate(bikeshop_name = as_factor(bikeshop_name) %>% fct_lump(n = n, w = total_price)) %>% 
  group_by(bikeshop_name) %>% 
  summarize(revenue = sum(total_price)) %>% 
  ungroup() %>% 
  mutate(bikeshop_name = bikeshop_name %>% fct_reorder(revenue)) %>% 
  mutate(bikeshop_name = bikeshop_name %>% fct_relevel("Other", after = 0)) %>% 
  arrange(desc(bikeshop_name)) %>% 
  mutate(revenue_text = scales::dollar(revenue)) %>% 
  mutate(cum_pct = cumsum(revenue) / sum(revenue)) %>% 
  mutate(cum_pct_txt = scales::percent(cum_pct)) %>% 
  mutate(rank = row_number()) %>% 
  mutate(rank = case_when(
    rank == max(rank) ~ NA_integer_, 
    TRUE ~ rank
  )) %>% 
  mutate(label_text = str_glue("Rank: {rank}\nRev: {revenue_text}\nCum: {cum_pct_txt}"))

top_customers %>% 
  ggplot(aes(revenue, bikeshop_name)) +
  geom_segment(aes(xend = 0, yend = bikeshop_name), color = "gray50") +
  geom_point(aes(size = revenue), color = "#18BC9C") + 
  geom_label(aes(label = label_text), hjust = "inward", size = 3) +
  scale_x_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
  labs(
    title = "Top 10 Customers by Revenue",
    subtitle = "With cumulative percentage contribution",
    x = "Revenue (Millions)",
    y = "Customer"
  ) +
  theme_tq() +
  theme(legend.position = "none")

Heatmap: Customer Purchasing Patterns

pct_sales_by_customer_tbl <- bike_orderlines_wrangled_tbl %>% 
  select(bikeshop_name, category_1, category_2, quantity) %>% 
  group_by(bikeshop_name, category_1, category_2) %>% 
  summarise(total_qty = sum(quantity)) %>% 
  ungroup() %>% 
  group_by(bikeshop_name) %>% 
  mutate(pct = total_qty / sum(total_qty)) %>% 
  ungroup() %>% 
  mutate(bikeshop_name = as.factor(bikeshop_name) %>% fct_rev())

pct_sales_by_customer_tbl %>% 
  ggplot(aes(category_2, bikeshop_name)) +
  geom_tile(aes(fill = pct)) +
  geom_text(aes(label = scales::percent(pct, accuracy = 1)), size = 2.5) +
  facet_wrap(~ category_1, scales = "free_x") +
  scale_fill_gradient(low = "white", high = "#2c3e50") +
  labs(
    title = "Heatmap of Customer Purchasing Habits", 
    subtitle = "Percentage of purchases by category",
    x = "Bike Type (Category 2)", 
    y = "Customer",
    fill = "% of Sales"
  ) + 
  theme_tq() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8), 
    plot.title = element_text(face = "bold")
  )

6. Feature Engineering

Custom Function: Separate Bike Model

separate_bike_model <- function(data, append = TRUE) {
  if (!append) {
    data <- data %>% select(model)
  }
  
  output_tbl <- data %>% 
    select(model) %>% 
    # Fix typos
    mutate(model = case_when(
      model == "CAAD Disc Ultegra" ~ "CAAD12 Disc Ultegra", 
      model == "Supersix Evo Hi-Mod Utegra" ~ "Supersix Evo Hi-Mod Ultegra",
      model == "Syapse Carbon Tiagra" ~ "Synapse Carbon Tiagra", 
      TRUE ~ model
    )) %>% 
    # Separate by spaces
    separate(col = model, 
             into = str_c("model_", 1:7), 
             sep = " ", 
             remove = FALSE, 
             fill = "right") %>% 
    # Extract base model
    mutate(model_base = case_when(
      str_detect(str_to_lower(model_1), "supersix") ~ str_c(model_1, model_2, sep = " "),
      str_detect(str_to_lower(model_1), "beast") ~ str_c(model_1, model_2, model_3, model_4, sep = " "),
      str_detect(str_to_lower(model_1), "bad") ~ str_c(model_1, model_2, sep = " "), 
      str_detect(str_to_lower(model_1), "fat") ~ str_c(model_1, model_2, sep = " "), 
      str_detect(str_to_lower(model_1), "29") ~ str_c(model_1, model_2, sep = " "),
      TRUE ~ model_1
    )) %>% 
    # Extract tier
    mutate(model_tier = model %>% str_replace(model_base, replacement = "") %>% str_trim()) %>% 
    select(-matches("model_[0-9]")) %>% 
    # Create feature flags
    mutate(
      black = model_tier %>% str_to_lower() %>% str_detect("black") %>% as.numeric(),
      red = model_tier %>% str_to_lower() %>% str_detect("red") %>% as.numeric(),
      hi_mod = model_tier %>% str_to_lower() %>% str_detect("hi_mod") %>% as.numeric(),
      team = model_tier %>% str_to_lower() %>% str_detect("team") %>% as.numeric(),
      ultegra = model_tier %>% str_to_lower() %>% str_detect("ultegra") %>% as.numeric(),
      dura_ace = model_tier %>% str_to_lower() %>% str_detect("dura_ace") %>% as.numeric(),
      disc = model_tier %>% str_to_lower() %>% str_detect("disc") %>% as.numeric()
    )
  
  return(output_tbl)
}

# Apply the function
bikes_enhanced <- separate_bike_model(bikes_tbl, append = TRUE)

# Display sample
head(bikes_enhanced) %>% 
  select(model, model_base, model_tier, black, ultegra, disc) %>%
  knitr::kable(caption = "Sample of Enhanced Bike Features")
Sample of Enhanced Bike Features
model model_base model_tier black ultegra disc
Supersix Evo Black Inc. Supersix Evo Black Inc. 1 0 0
Supersix Evo Hi-Mod Team Supersix Evo Hi-Mod Team 0 0 0
Supersix Evo Hi-Mod Dura Ace 1 Supersix Evo Hi-Mod Dura Ace 1 0 0 0
Supersix Evo Hi-Mod Dura Ace 2 Supersix Evo Hi-Mod Dura Ace 2 0 0 0
Supersix Evo Hi-Mod Ultegra Supersix Evo Hi-Mod Ultegra 0 1 0
Supersix Evo Red Supersix Evo Red 0 0 0

7. Summary and Insights

Key Findings

cat("=== ANALYSIS SUMMARY ===\n\n")
## === ANALYSIS SUMMARY ===
cat("Total Revenue:", scales::dollar(sum(bike_orderlines_wrangled_tbl$total_price)), "\n")
## Total Revenue: $71,032,330
cat("Total Orders:", n_distinct(bike_orderlines_wrangled_tbl$order_id), "\n")
## Total Orders: 2000
cat("Total Customers:", n_distinct(bike_orderlines_wrangled_tbl$bikeshop_name), "\n")
## Total Customers: 30
cat("Average Order Value:", scales::dollar(mean(order_value_tbl$total_price)), "\n\n")
## Average Order Value: $35,516.16
cat("Top Category:", sales_by_category$`Primary Category`[1], "\n")
## Top Category: Mountain
cat("Sales Growth (2011-2015):", 
    bike_sales_y_analysis$pct_diff_1_chr[nrow(bike_sales_y_analysis)], "\n")
## Sales Growth (2011-2015): 23.3%

Report Generated: 2025-12-16
Analysis Period: 2011-2015
Data Source: Bike Sales Transaction Data