# Install packages if needed (run once)
# install.packages(c("readxl", "tidyverse", "lubridate", "tidyquant", "writexl"))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(writexl)
library(lubridate)
library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8 ✔ TTR 0.24.4
## ✔ quantmod 0.4.28 ✔ xts 0.14.1── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import Excel files
bikes <- read_excel("~/Downloads/bikes.xlsx")
bikeshops <- read_excel("~/Downloads/bikeshops.xlsx")
orderlines <- read_excel("~/Downloads/orderlines.xlsx")
## New names:
## • `` -> `...1`
# Examine the data structure
glimpse(bikes)
## Rows: 97
## Columns: 4
## $ bike.id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ model <chr> "Supersix Evo Black Inc.", "Supersix Evo Hi-Mod Team", "Su…
## $ description <chr> "Road - Elite Road - Carbon", "Road - Elite Road - Carbon"…
## $ price <dbl> 12790, 10660, 7990, 5330, 4260, 3940, 3200, 2660, 2240, 18…
glimpse(bikeshops)
## Rows: 30
## Columns: 3
## $ bikeshop.id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ bikeshop.name <chr> "Pittsburgh Mountain Machines", "Ithaca Mountain Climber…
## $ location <chr> "Pittsburgh, PA", "Ithaca, NY", "Columbus, OH", "Detroit…
glimpse(orderlines)
## Rows: 15,644
## Columns: 7
## $ ...1 <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "…
## $ order.id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7…
## $ order.line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2…
## $ order.date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2011-01-1…
## $ customer.id <dbl> 2, 2, 10, 10, 6, 6, 6, 6, 6, 22, 8, 8, 8, 8, 16, 16, 16, 1…
## $ product.id <dbl> 48, 52, 76, 52, 2, 50, 1, 4, 34, 26, 96, 66, 35, 72, 45, 3…
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1…
###2. Data Wrangling Join Tables
# Join all three tables using the pipe operator
bike_orderlines_wrangled <- orderlines %>%
left_join(bikes, by = c("product.id" = "bike.id")) %>%
left_join(bikeshops, by = c("customer.id" = "bikeshop.id")) %>%
# Separate description into multiple columns
separate(description,
into = c('category.1', 'category.2', 'frame.material'),
sep = ' - ') %>%
# Separate location into city and state
separate(location,
into = c('city', 'state'),
sep = ', ',
remove = FALSE) %>%
# Calculate total price
mutate(total.price = price * quantity) %>%
# Reorganize and rename columns
select(-...1, -location) %>%
select(contains('date'), contains('id'),
contains('order'),
quantity, price, total.price,
everything()) %>%
rename(order_date = order.date) %>%
set_names(names(.) %>% str_replace_all("\\.", "_"))
# Display first few rows
head(bike_orderlines_wrangled)
## # A tibble: 6 × 15
## order_date order_id customer_id product_id order_line quantity price
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011-01-07 00:00:00 1 2 48 1 1 6070
## 2 2011-01-07 00:00:00 1 2 52 2 1 5970
## 3 2011-01-10 00:00:00 2 10 76 1 1 2770
## 4 2011-01-10 00:00:00 2 10 52 2 1 5970
## 5 2011-01-10 00:00:00 3 6 2 1 1 10660
## 6 2011-01-10 00:00:00 3 6 50 2 1 3200
## # ℹ 8 more variables: total_price <dbl>, model <chr>, category_1 <chr>,
## # category_2 <chr>, frame_material <chr>, bikeshop_name <chr>, city <chr>,
## # state <chr>
# Save cleaned data
saveRDS(bike_orderlines_wrangled, './bike_orderlines.rds')
###3. Exploratory Data Analysis Key dplyr Functions Select and Filter Operations
# Calculate average total price using pull()
avg_price <- bike_orderlines_wrangled %>%
pull(total_price) %>%
mean()
cat("Average Order Total Price: $", round(avg_price, 2), "\n")
## Average Order Total Price: $ 4540.55
# Filter bikes above average price
bikes %>%
select(model, price) %>%
filter(price > mean(price)) %>%
arrange(desc(price)) %>%
head(10)
## # A tibble: 10 × 2
## model price
## <chr> <dbl>
## 1 Supersix Evo Black Inc. 12790
## 2 Scalpel-Si Black Inc. 12790
## 3 Habit Hi-Mod Black Inc. 12250
## 4 F-Si Black Inc. 11190
## 5 Supersix Evo Hi-Mod Team 10660
## 6 Synapse Hi-Mod Disc Black Inc. 9590
## 7 Scalpel-Si Race 9060
## 8 F-Si Hi-Mod Team 9060
## 9 Trigger Carbon 1 8200
## 10 Supersix Evo Hi-Mod Dura Ace 1 7990
# Q1: What are the unique categories of products?
cat("Primary Categories:\n")
## Primary Categories:
bike_orderlines_wrangled %>% distinct(category_1) %>% pull()
## [1] "Mountain" "Road"
cat("\nSecondary Categories:\n")
##
## Secondary Categories:
bike_orderlines_wrangled %>% distinct(category_2) %>% pull()
## [1] "Over Mountain" "Trail" "Elite Road"
## [4] "Endurance Road" "Sport" "Cross Country Race"
## [7] "Cyclocross" "Triathalon" "Fat Bike"
cat("\nFrame Materials:\n")
##
## Frame Materials:
bike_orderlines_wrangled %>% distinct(frame_material) %>% pull()
## [1] "Carbon" "Aluminum"
###Sales by Category
# Q2: Which product categories have the largest sales?
sales_by_category <- bike_orderlines_wrangled %>%
select(category_1, total_price) %>%
group_by(category_1) %>%
summarise(sales = sum(total_price)) %>%
ungroup() %>%
arrange(desc(sales)) %>%
rename(`Primary Category` = category_1,
Sales = sales) %>%
mutate(Sales_Formatted = scales::dollar(Sales))
knitr::kable(sales_by_category,
caption = "Total Sales by Primary Category",
format.args = list(big.mark = ","))
| Primary Category | Sales | Sales_Formatted |
|---|---|---|
| Mountain | 39,154,735 | $39,154,735 |
| Road | 31,877,595 | $31,877,595 |
###4. Time Series Analysis Sales Trends Over Time
# Create function to calculate percentage differences
calculate_pct_diff <- function(data){
data %>%
mutate(sales_lag_1 = lag(sales, n = 1)) %>%
mutate(sales_lag_1 = case_when(
is.na(sales_lag_1) ~ sales,
TRUE ~ sales_lag_1
)) %>%
mutate(diff_1 = sales - sales_lag_1) %>%
mutate(pct_diff_1 = diff_1 / sales_lag_1) %>%
mutate(pct_diff_1_chr = scales::percent(pct_diff_1))
}
# Calculate yearly sales
bike_sales_y <- bike_orderlines_wrangled %>%
select(order_date, total_price) %>%
mutate(order_date = ymd(order_date)) %>%
mutate(year = year(order_date)) %>%
group_by(year) %>%
summarise(sales = sum(total_price)) %>%
ungroup()
# Apply percentage difference calculation
bike_sales_y_analysis <- calculate_pct_diff(bike_sales_y)
knitr::kable(bike_sales_y_analysis,
caption = "Yearly Sales with Year-over-Year Growth",
format.args = list(big.mark = ","))
| year | sales | sales_lag_1 | diff_1 | pct_diff_1 | pct_diff_1_chr |
|---|---|---|---|---|---|
| 2,011 | 11,292,885 | 11,292,885 | 0 | 0.0000000 | 0.0% |
| 2,012 | 12,163,075 | 11,292,885 | 870,190 | 0.0770565 | 7.7% |
| 2,013 | 16,480,775 | 12,163,075 | 4,317,700 | 0.3549842 | 35.5% |
| 2,014 | 13,924,085 | 16,480,775 | -2,556,690 | -0.1551317 | -15.5% |
| 2,015 | 17,171,510 | 13,924,085 | 3,247,425 | 0.2332236 | 23.3% |
###Monthly Sales Aggregation
# Calculate monthly sales
bike_sales_m <- bike_orderlines_wrangled %>%
select(order_date, total_price) %>%
mutate(order_date = ymd(order_date)) %>%
mutate(year_month = floor_date(order_date, unit = "month")) %>%
group_by(year_month) %>%
summarise(sales = sum(total_price)) %>%
ungroup()
# Show first and last 6 months
head(bike_sales_m, 6)
## # A tibble: 6 × 2
## year_month sales
## <date> <dbl>
## 1 2011-01-01 483015
## 2 2011-02-01 1162075
## 3 2011-03-01 659975
## 4 2011-04-01 1827140
## 5 2011-05-01 844170
## 6 2011-06-01 1413445
tail(bike_sales_m, 6)
## # A tibble: 6 × 2
## year_month sales
## <date> <dbl>
## 1 2015-07-01 1166455
## 2 2015-08-01 955090
## 3 2015-09-01 767355
## 4 2015-10-01 949455
## 5 2015-11-01 1065885
## 6 2015-12-01 926070
###5. Data Visualizations Line Chart: Revenue Trends
bike_sales_y %>%
ggplot(aes(x = year, y = sales, color = sales)) +
geom_point(size = 5) +
geom_line(linewidth = 2) +
geom_smooth(method = "lm", formula = 'y ~ x', se = FALSE) +
expand_limits(y = c(0, 20e6)) +
scale_colour_continuous(low = "green", high = "pink",
labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
scale_y_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
labs(
title = "Revenue Trend Analysis",
subtitle = "Sales are trending up and to the right!",
x = "Year",
y = "Sales (Millions)",
color = "Rev ($M)",
caption = "Total sales from 2011 to 2015"
) +
theme_tq()
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
###Bar Chart: Revenue by Category
revenue_by_category2 <- bike_orderlines_wrangled %>%
select(category_2, total_price) %>%
group_by(category_2) %>%
summarise(revenue = sum(total_price)) %>%
ungroup()
revenue_by_category2 %>%
mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>%
ggplot(aes(category_2, revenue)) +
geom_col(fill = "#2c3e50") +
coord_flip() +
scale_y_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
labs(
title = "Revenue by Bike Category",
x = "Category",
y = "Revenue (Millions)"
) +
theme_tq()
###Scatter Plot: Order Value Analysis
order_value_tbl <- bike_orderlines_wrangled %>%
select(order_id, order_line, total_price, quantity) %>%
group_by(order_id) %>%
summarize(
total_quantity = sum(quantity),
total_price = sum(total_price)
) %>%
ungroup()
order_value_tbl %>%
ggplot(aes(x = total_quantity, y = total_price)) +
geom_point(alpha = 0.5, size = 2, color = "#F527F2") +
geom_smooth(method = "lm", se = TRUE, color = "#9c2e50") +
scale_y_continuous(labels = scales::dollar_format()) +
labs(
title = "Order Value vs. Quantity",
subtitle = "Relationship between items ordered and total price",
x = "Total Quantity",
y = "Total Price"
) +
theme_tq()
## `geom_smooth()` using formula = 'y ~ x'
###Histogram: Price Distribution
bike_orderlines_wrangled %>%
distinct(price, model, frame_material) %>%
ggplot(aes(price, fill = frame_material)) +
geom_histogram(bins = 30, color = "purple") +
facet_wrap(~ frame_material, ncol = 1) +
scale_fill_tq() +
scale_x_continuous(labels = scales::dollar_format()) +
labs(
title = "Price Distribution by Frame Material",
x = "Price",
y = "Count",
fill = "Frame Material"
) +
theme_tq() +
theme(legend.position = "none")
###Box Plot: Price by Category
unit_price_by_cat2 <- bike_orderlines_wrangled %>%
select(category_2, model, price) %>%
distinct() %>%
mutate(category_2 = as_factor(category_2) %>% fct_reorder(price))
unit_price_by_cat2 %>%
ggplot(aes(category_2, price)) +
geom_boxplot(fill = "#F54927", alpha = 0.7) +
coord_flip() +
scale_y_continuous(labels = scales::dollar_format()) +
labs(
title = "Price Distribution by Bike Category",
x = "Category",
y = "Price"
) +
theme_tq()
n <- 10
top_customers <- bike_orderlines_wrangled %>%
select(bikeshop_name, total_price) %>%
mutate(bikeshop_name = as_factor(bikeshop_name) %>% fct_lump(n = n, w = total_price)) %>%
group_by(bikeshop_name) %>%
summarize(revenue = sum(total_price)) %>%
ungroup() %>%
mutate(bikeshop_name = bikeshop_name %>% fct_reorder(revenue)) %>%
mutate(bikeshop_name = bikeshop_name %>% fct_relevel("Other", after = 0)) %>%
arrange(desc(bikeshop_name)) %>%
mutate(revenue_text = scales::dollar(revenue)) %>%
mutate(cum_pct = cumsum(revenue) / sum(revenue)) %>%
mutate(cum_pct_txt = scales::percent(cum_pct)) %>%
mutate(rank = row_number()) %>%
mutate(rank = case_when(
rank == max(rank) ~ NA_integer_,
TRUE ~ rank
)) %>%
mutate(label_text = str_glue("Rank: {rank}\nRev: {revenue_text}\nCum: {cum_pct_txt}"))
top_customers %>%
ggplot(aes(revenue, bikeshop_name)) +
geom_segment(aes(xend = 0, yend = bikeshop_name), color = "gray50") +
geom_point(aes(size = revenue), color = "#18BC9C") +
geom_label(aes(label = label_text), hjust = "inward", size = 3) +
scale_x_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
labs(
title = "Top 10 Customers by Revenue",
subtitle = "With cumulative percentage contribution",
x = "Revenue (Millions)",
y = "Customer"
) +
theme_tq() +
theme(legend.position = "none")
###Heatmap: Customer Purchasing Patterns
pct_sales_by_customer <- bike_orderlines_wrangled %>%
select(bikeshop_name, category_1, category_2, quantity) %>%
group_by(bikeshop_name, category_1, category_2) %>%
summarise(total_qty = sum(quantity)) %>%
ungroup() %>%
group_by(bikeshop_name) %>%
mutate(pct = total_qty / sum(total_qty)) %>%
ungroup() %>%
mutate(bikeshop_name = as.factor(bikeshop_name) %>% fct_rev())
## `summarise()` has grouped output by 'bikeshop_name', 'category_1'. You can
## override using the `.groups` argument.
pct_sales_by_customer %>%
ggplot(aes(category_2, bikeshop_name)) +
geom_tile(aes(fill = pct)) +
geom_text(aes(label = scales::percent(pct, accuracy = 1)), size = 2.5) +
facet_wrap(~ category_1, scales = "free_x") +
scale_fill_gradient(low = "white", high = "#2c3e50") +
labs(
title = "Heatmap of Customer Purchasing Habits",
subtitle = "Percentage of purchases by category",
x = "Bike Type (Category 2)",
y = "Customer",
fill = "% of Sales"
) +
theme_tq() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
plot.title = element_text(face = "bold")
)
###6. Feature Engineering Custom Function: Separate Bike Model
separate_bike_model <- function(data, append = TRUE) {
if (!append) {
data <- data %>% select(model)
}
output_tbl <- data %>%
select(model) %>%
# Fix typos
mutate(model = case_when(
model == "CAAD Disc Ultegra" ~ "CAAD12 Disc Ultegra",
model == "Supersix Evo Hi-Mod Utegra" ~ "Supersix Evo Hi-Mod Ultegra",
model == "Syapse Carbon Tiagra" ~ "Synapse Carbon Tiagra",
TRUE ~ model
)) %>%
# Separate by spaces
separate(col = model,
into = str_c("model_", 1:7),
sep = " ",
remove = FALSE,
fill = "right") %>%
# Extract base model
mutate(model_base = case_when(
str_detect(str_to_lower(model_1), "supersix") ~ str_c(model_1, model_2, sep = " "),
str_detect(str_to_lower(model_1), "beast") ~ str_c(model_1, model_2, model_3, model_4, sep = " "),
str_detect(str_to_lower(model_1), "bad") ~ str_c(model_1, model_2, sep = " "),
str_detect(str_to_lower(model_1), "fat") ~ str_c(model_1, model_2, sep = " "),
str_detect(str_to_lower(model_1), "29") ~ str_c(model_1, model_2, sep = " "),
TRUE ~ model_1
)) %>%
# Extract tier
mutate(model_tier = model %>% str_replace(model_base, replacement = "") %>% str_trim()) %>%
select(-matches("model_[0-9]")) %>%
# Create feature flags
mutate(
black = model_tier %>% str_to_lower() %>% str_detect("black") %>% as.numeric(),
red = model_tier %>% str_to_lower() %>% str_detect("red") %>% as.numeric(),
hi_mod = model_tier %>% str_to_lower() %>% str_detect("hi_mod") %>% as.numeric(),
team = model_tier %>% str_to_lower() %>% str_detect("team") %>% as.numeric(),
ultegra = model_tier %>% str_to_lower() %>% str_detect("ultegra") %>% as.numeric(),
dura_ace = model_tier %>% str_to_lower() %>% str_detect("dura_ace") %>% as.numeric(),
disc = model_tier %>% str_to_lower() %>% str_detect("disc") %>% as.numeric()
)
return(output_tbl)
}
# Apply the function
bikes_enhanced <- separate_bike_model(bikes, append = TRUE)
# Display sample
head(bikes_enhanced) %>%
select(model, model_base, model_tier, black, ultegra, disc) %>%
knitr::kable(caption = "Sample of Enhanced Bike Features")
| model | model_base | model_tier | black | ultegra | disc |
|---|---|---|---|---|---|
| Supersix Evo Black Inc. | Supersix Evo | Black Inc. | 1 | 0 | 0 |
| Supersix Evo Hi-Mod Team | Supersix Evo | Hi-Mod Team | 0 | 0 | 0 |
| Supersix Evo Hi-Mod Dura Ace 1 | Supersix Evo | Hi-Mod Dura Ace 1 | 0 | 0 | 0 |
| Supersix Evo Hi-Mod Dura Ace 2 | Supersix Evo | Hi-Mod Dura Ace 2 | 0 | 0 | 0 |
| Supersix Evo Hi-Mod Ultegra | Supersix Evo | Hi-Mod Ultegra | 0 | 1 | 0 |
| Supersix Evo Red | Supersix Evo | Red | 0 | 0 | 0 |
###7. Summary and Insights Key Findings Hide
cat("=== ANALYSIS SUMMARY ===\n\n")
## === ANALYSIS SUMMARY ===
cat("Total Revenue:", scales::dollar(sum(bike_orderlines_wrangled$total_price)), "\n")
## Total Revenue: $71,032,330
cat("Total Orders:", n_distinct(bike_orderlines_wrangled$order_id), "\n")
## Total Orders: 2000
cat("Total Customers:", n_distinct(bike_orderlines_wrangled$bikeshop_name), "\n")
## Total Customers: 30
cat("Average Order Value:", scales::dollar(mean(order_value_tbl$total_price)), "\n\n")
## Average Order Value: $35,516.16
cat("Sales Growth (2011-2015):",
bike_sales_y_analysis$pct_diff_1_chr[nrow(bike_sales_y_analysis)], "\n")
## Sales Growth (2011-2015): 23.3%