1. Load Libraries

library(tidyverse)
library(readxl)
library(lubridate)
library(scales)

2. Data Import

# Import Excel files
bikes_tbl <- read_excel("./bikes.xlsx")
bikeshops_tbl <- read_excel("./bikeshops.xlsx")
orderlines_tbl <- read_excel("./orderlines.xlsx")

# Display data structure
glimpse(bikes_tbl)
## Rows: 97
## Columns: 4
## $ bike.id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ model       <chr> "Supersix Evo Black Inc.", "Supersix Evo Hi-Mod Team", "Su…
## $ description <chr> "Road - Elite Road - Carbon", "Road - Elite Road - Carbon"…
## $ price       <dbl> 12790, 10660, 7990, 5330, 4260, 3940, 3200, 2660, 2240, 18…
glimpse(bikeshops_tbl)
## Rows: 30
## Columns: 3
## $ bikeshop.id   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ bikeshop.name <chr> "Pittsburgh Mountain Machines", "Ithaca Mountain Climber…
## $ location      <chr> "Pittsburgh, PA", "Ithaca, NY", "Columbus, OH", "Detroit…
glimpse(orderlines_tbl)
## Rows: 15,644
## Columns: 7
## $ ...1        <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "…
## $ order.id    <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7…
## $ order.line  <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2…
## $ order.date  <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2011-01-1…
## $ customer.id <dbl> 2, 2, 10, 10, 6, 6, 6, 6, 6, 22, 8, 8, 8, 8, 16, 16, 16, 1…
## $ product.id  <dbl> 48, 52, 76, 52, 2, 50, 1, 4, 34, 26, 96, 66, 35, 72, 45, 3…
## $ quantity    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1…

3. Data Joining

# Join all three tables
bike_orderlines_bikeshops_joined <- orderlines_tbl %>% 
  left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>% 
  left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))

head(bike_orderlines_bikeshops_joined)
## # A tibble: 6 × 12
##   ...1  order.id order.line order.date          customer.id product.id quantity
##   <chr>    <dbl>      <dbl> <dttm>                    <dbl>      <dbl>    <dbl>
## 1 1            1          1 2011-01-07 00:00:00           2         48        1
## 2 2            1          2 2011-01-07 00:00:00           2         52        1
## 3 3            2          1 2011-01-10 00:00:00          10         76        1
## 4 4            2          2 2011-01-10 00:00:00          10         52        1
## 5 5            3          1 2011-01-10 00:00:00           6          2        1
## 6 6            3          2 2011-01-10 00:00:00           6         50        1
## # ℹ 5 more variables: model <chr>, description <chr>, price <dbl>,
## #   bikeshop.name <chr>, location <chr>

4. Data Wrangling

# Decompose description and location, create calculated columns
bike_orderlines_wrangled_tbl <- bike_orderlines_bikeshops_joined %>% 
  separate(description, 
           into = c('category.1', 'category.2', 'frame.material'), 
           sep  = ' - ') %>% 
  separate(location, 
           into = c('city', 'state'), 
           sep  = ', ',
           remove = FALSE) %>% 
  # Create calculated columns 
  mutate(total.price = price * quantity) %>% 
  # Reorganize columns                                  
  select(-...1, -location) %>% 
  # Reorder columns                                  
  select(contains('date'), contains('id'), 
         contains('order'), 
         quantity, price, total.price, 
         everything()) %>% 
  # Rename columns
  rename(order_date = order.date) %>%
  set_names(names(.) %>% str_replace_all("\\.", "_"))

# Display cleaned data
head(bike_orderlines_wrangled_tbl)
## # A tibble: 6 × 15
##   order_date          order_id customer_id product_id order_line quantity price
##   <dttm>                 <dbl>       <dbl>      <dbl>      <dbl>    <dbl> <dbl>
## 1 2011-01-07 00:00:00        1           2         48          1        1  6070
## 2 2011-01-07 00:00:00        1           2         52          2        1  5970
## 3 2011-01-10 00:00:00        2          10         76          1        1  2770
## 4 2011-01-10 00:00:00        2          10         52          2        1  5970
## 5 2011-01-10 00:00:00        3           6          2          1        1 10660
## 6 2011-01-10 00:00:00        3           6         50          2        1  3200
## # ℹ 8 more variables: total_price <dbl>, model <chr>, category_1 <chr>,
## #   category_2 <chr>, frame_material <chr>, bikeshop_name <chr>, city <chr>,
## #   state <chr>

5. Data Exploration

5.1 Unique Product Categories

cat("Category 1:\n")
## Category 1:
bike_orderlines_wrangled_tbl %>% distinct(category_1) %>% pull()
## [1] "Mountain" "Road"
cat("\nCategory 2:\n")
## 
## Category 2:
bike_orderlines_wrangled_tbl %>% distinct(category_2) %>% pull()
## [1] "Over Mountain"      "Trail"              "Elite Road"        
## [4] "Endurance Road"     "Sport"              "Cross Country Race"
## [7] "Cyclocross"         "Triathalon"         "Fat Bike"
cat("\nFrame Material:\n")
## 
## Frame Material:
bike_orderlines_wrangled_tbl %>% distinct(frame_material) %>% pull()
## [1] "Carbon"   "Aluminum"

5.2 Sales by Primary Category

bike_orderlines_wrangled_tbl %>% 
  group_by(category_1) %>% 
  summarise(Sales = sum(total_price)) %>%
  ungroup() %>% 
  arrange(desc(Sales)) %>% 
  rename(`Primary Category` = category_1) %>% 
  mutate(Sales_Formatted = dollar(Sales)) %>% 
  knitr::kable(format.args = list(big.mark = ","))
Primary Category Sales Sales_Formatted
Mountain 39,154,735 $39,154,735
Road 31,877,595 $31,877,595

5.3 Top Products by Price

bikes_tbl %>% 
  select(model, price) %>% 
  arrange(desc(price)) %>% 
  head(10) %>% 
  mutate(price = dollar(price)) %>% 
  knitr::kable()
model price
Supersix Evo Black Inc. $12,790
Scalpel-Si Black Inc. $12,790
Habit Hi-Mod Black Inc. $12,250
F-Si Black Inc. $11,190
Supersix Evo Hi-Mod Team $10,660
Synapse Hi-Mod Disc Black Inc. $9,590
Scalpel-Si Race $9,060
F-Si Hi-Mod Team $9,060
Trigger Carbon 1 $8,200
Supersix Evo Hi-Mod Dura Ace 1 $7,990

6. Time Series Analysis

6.1 Sales by Year

bike_sales_y <- bike_orderlines_wrangled_tbl %>%
  select(order_date, total_price) %>% 
  mutate(order_date = ymd(order_date)) %>% 
  mutate(year = year(order_date)) %>% 
  group_by(year) %>% 
  summarise(sales = sum(total_price)) %>% 
  ungroup()

bike_sales_y %>% 
  mutate(sales = dollar(sales)) %>% 
  knitr::kable()
year sales
2011 $11,292,885
2012 $12,163,075
2013 $16,480,775
2014 $13,924,085
2015 $17,171,510

6.2 Year-over-Year Growth

calculate_pct_diff <- function(data){
  data %>% 
    mutate(sales_lag_1 = lag(sales, n = 1)) %>% 
    mutate(sales_lag_1 = case_when(
      is.na(sales_lag_1) ~ sales,
      TRUE ~ sales_lag_1
    )) %>% 
    mutate(diff_1 = sales - sales_lag_1) %>% 
    mutate(pct_diff_1 = diff_1 / sales_lag_1) %>% 
    mutate(pct_diff_1_chr = percent(pct_diff_1))
}

bike_sales_y %>% 
  calculate_pct_diff() %>% 
  mutate(sales = dollar(sales),
         diff_1 = dollar(diff_1)) %>% 
  select(year, sales, diff_1, pct_diff_1_chr) %>% 
  rename(Year = year, 
         Sales = sales, 
         `Change ($)` = diff_1, 
         `Change (%)` = pct_diff_1_chr) %>% 
  knitr::kable()
Year Sales Change ($) Change (%)
2011 $11,292,885 $0 0.0%
2012 $12,163,075 $870,190 7.7%
2013 $16,480,775 $4,317,700 35.5%
2014 $13,924,085 -$2,556,690 -15.5%
2015 $17,171,510 $3,247,425 23.3%

6.3 Monthly Sales

bike_sales_m <- bike_orderlines_wrangled_tbl %>% 
  select(order_date, total_price) %>% 
  mutate(order_date = ymd(order_date)) %>% 
  mutate(year_month = floor_date(order_date, unit = "month")) %>%
  group_by(year_month) %>% 
  summarise(sales = sum(total_price))

head(bike_sales_m, 10) %>% 
  mutate(sales = dollar(sales)) %>% 
  knitr::kable()
year_month sales
2011-01-01 $483,015
2011-02-01 $1,162,075
2011-03-01 $659,975
2011-04-01 $1,827,140
2011-05-01 $844,170
2011-06-01 $1,413,445
2011-07-01 $1,194,430
2011-08-01 $679,790
2011-09-01 $814,720
2011-10-01 $734,920

7. Visualizations

7.1 Revenue Trend by Year

bike_sales_y %>% 
  ggplot(aes(x = year, y = sales, color = sales)) +
  geom_point(size = 5) + 
  geom_line(linewidth = 2) + 
  geom_smooth(method = "lm", formula = 'y ~ x', se = FALSE) +
  expand_limits(y = c(0, 20e6)) + 
  scale_colour_continuous(low = "red", high = "black", 
                          labels = dollar_format(scale = 1/1e6, suffix = "M")) +
  scale_y_continuous(labels = dollar_format(scale = 1/1e6, suffix = "M")) + 
  labs(
    title = "Revenue Trend",
    subtitle = "Sales are trending up and to the right!",
    x = "Year", 
    y = "Sales (Millions)",
    color = "Rev ($M)",
    caption = "Total sales from 2011 to 2015"
  ) +
  theme_minimal()

7.2 Revenue by Category 2

revenue_by_category2_tbl <- bike_orderlines_wrangled_tbl %>% 
  select(category_2, total_price) %>% 
  group_by(category_2) %>% 
  summarise(revenue = sum(total_price)) %>% 
  ungroup()

revenue_by_category2_tbl %>% 
  mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>% 
  ggplot(aes(category_2, revenue)) +
  geom_col(fill = "#2C3E50") +
  coord_flip() +
  scale_y_continuous(labels = dollar_format(scale = 1/1e6, suffix = "M")) +
  labs(
    title = "Revenue by Product Category",
    x = "Category",
    y = "Revenue (Millions)"
  ) +
  theme_minimal()

7.3 Revenue by Primary Category

bike_orderlines_wrangled_tbl %>% 
  group_by(category_1) %>% 
  summarise(revenue = sum(total_price)) %>% 
  ungroup() %>% 
  mutate(category_1 = category_1 %>% as_factor() %>% fct_reorder(revenue)) %>% 
  ggplot(aes(category_1, revenue, fill = category_1)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_y_continuous(labels = dollar_format(scale = 1/1e6, suffix = "M")) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Revenue by Primary Category",
    x = "Primary Category",
    y = "Revenue (Millions)"
  ) +
  theme_minimal()

8. Summary Statistics

cat("Total Revenue:", dollar(sum(bike_orderlines_wrangled_tbl$total_price)), "\n")
## Total Revenue: $71,032,330
cat("Average Order Value:", dollar(mean(bike_orderlines_wrangled_tbl$total_price)), "\n")
## Average Order Value: $4,540.55
cat("Total Orders:", nrow(bike_orderlines_wrangled_tbl), "\n")
## Total Orders: 15644
cat("Date Range:", min(bike_orderlines_wrangled_tbl$order_date), "to", 
    max(bike_orderlines_wrangled_tbl$order_date), "\n")
## Date Range: 1294358400 to 1451001600

9. Save Processed Data

# Save the wrangled data
saveRDS(bike_orderlines_wrangled_tbl, './bike_orderlines.rds')

Analysis complete! This report provides insights into bike sales trends, categories, and revenue patterns.