library(tidyverse)
library(readxl)
library(lubridate)
library(scales)
# Import Excel files
bikes_tbl <- read_excel("./bikes.xlsx")
bikeshops_tbl <- read_excel("./bikeshops.xlsx")
orderlines_tbl <- read_excel("./orderlines.xlsx")
# Display data structure
glimpse(bikes_tbl)
## Rows: 97
## Columns: 4
## $ bike.id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ model <chr> "Supersix Evo Black Inc.", "Supersix Evo Hi-Mod Team", "Su…
## $ description <chr> "Road - Elite Road - Carbon", "Road - Elite Road - Carbon"…
## $ price <dbl> 12790, 10660, 7990, 5330, 4260, 3940, 3200, 2660, 2240, 18…
glimpse(bikeshops_tbl)
## Rows: 30
## Columns: 3
## $ bikeshop.id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ bikeshop.name <chr> "Pittsburgh Mountain Machines", "Ithaca Mountain Climber…
## $ location <chr> "Pittsburgh, PA", "Ithaca, NY", "Columbus, OH", "Detroit…
glimpse(orderlines_tbl)
## Rows: 15,644
## Columns: 7
## $ ...1 <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "…
## $ order.id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7…
## $ order.line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2…
## $ order.date <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2011-01-1…
## $ customer.id <dbl> 2, 2, 10, 10, 6, 6, 6, 6, 6, 22, 8, 8, 8, 8, 16, 16, 16, 1…
## $ product.id <dbl> 48, 52, 76, 52, 2, 50, 1, 4, 34, 26, 96, 66, 35, 72, 45, 3…
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1…
# Join all three tables
bike_orderlines_bikeshops_joined <- orderlines_tbl %>%
left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>%
left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))
head(bike_orderlines_bikeshops_joined)
## # A tibble: 6 × 12
## ...1 order.id order.line order.date customer.id product.id quantity
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48 1
## 2 2 1 2 2011-01-07 00:00:00 2 52 1
## 3 3 2 1 2011-01-10 00:00:00 10 76 1
## 4 4 2 2 2011-01-10 00:00:00 10 52 1
## 5 5 3 1 2011-01-10 00:00:00 6 2 1
## 6 6 3 2 2011-01-10 00:00:00 6 50 1
## # ℹ 5 more variables: model <chr>, description <chr>, price <dbl>,
## # bikeshop.name <chr>, location <chr>
# Decompose description and location, create calculated columns
bike_orderlines_wrangled_tbl <- bike_orderlines_bikeshops_joined %>%
separate(description,
into = c('category.1', 'category.2', 'frame.material'),
sep = ' - ') %>%
separate(location,
into = c('city', 'state'),
sep = ', ',
remove = FALSE) %>%
# Create calculated columns
mutate(total.price = price * quantity) %>%
# Reorganize columns
select(-...1, -location) %>%
# Reorder columns
select(contains('date'), contains('id'),
contains('order'),
quantity, price, total.price,
everything()) %>%
# Rename columns
rename(order_date = order.date) %>%
set_names(names(.) %>% str_replace_all("\\.", "_"))
# Display cleaned data
head(bike_orderlines_wrangled_tbl)
## # A tibble: 6 × 15
## order_date order_id customer_id product_id order_line quantity price
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011-01-07 00:00:00 1 2 48 1 1 6070
## 2 2011-01-07 00:00:00 1 2 52 2 1 5970
## 3 2011-01-10 00:00:00 2 10 76 1 1 2770
## 4 2011-01-10 00:00:00 2 10 52 2 1 5970
## 5 2011-01-10 00:00:00 3 6 2 1 1 10660
## 6 2011-01-10 00:00:00 3 6 50 2 1 3200
## # ℹ 8 more variables: total_price <dbl>, model <chr>, category_1 <chr>,
## # category_2 <chr>, frame_material <chr>, bikeshop_name <chr>, city <chr>,
## # state <chr>
cat("Category 1:\n")
## Category 1:
bike_orderlines_wrangled_tbl %>% distinct(category_1) %>% pull()
## [1] "Mountain" "Road"
cat("\nCategory 2:\n")
##
## Category 2:
bike_orderlines_wrangled_tbl %>% distinct(category_2) %>% pull()
## [1] "Over Mountain" "Trail" "Elite Road"
## [4] "Endurance Road" "Sport" "Cross Country Race"
## [7] "Cyclocross" "Triathalon" "Fat Bike"
cat("\nFrame Material:\n")
##
## Frame Material:
bike_orderlines_wrangled_tbl %>% distinct(frame_material) %>% pull()
## [1] "Carbon" "Aluminum"
bike_orderlines_wrangled_tbl %>%
group_by(category_1) %>%
summarise(Sales = sum(total_price)) %>%
ungroup() %>%
arrange(desc(Sales)) %>%
rename(`Primary Category` = category_1) %>%
mutate(Sales_Formatted = dollar(Sales)) %>%
knitr::kable(format.args = list(big.mark = ","))
| Primary Category | Sales | Sales_Formatted |
|---|---|---|
| Mountain | 39,154,735 | $39,154,735 |
| Road | 31,877,595 | $31,877,595 |
bikes_tbl %>%
select(model, price) %>%
arrange(desc(price)) %>%
head(10) %>%
mutate(price = dollar(price)) %>%
knitr::kable()
| model | price |
|---|---|
| Supersix Evo Black Inc. | $12,790 |
| Scalpel-Si Black Inc. | $12,790 |
| Habit Hi-Mod Black Inc. | $12,250 |
| F-Si Black Inc. | $11,190 |
| Supersix Evo Hi-Mod Team | $10,660 |
| Synapse Hi-Mod Disc Black Inc. | $9,590 |
| Scalpel-Si Race | $9,060 |
| F-Si Hi-Mod Team | $9,060 |
| Trigger Carbon 1 | $8,200 |
| Supersix Evo Hi-Mod Dura Ace 1 | $7,990 |
bike_sales_y <- bike_orderlines_wrangled_tbl %>%
select(order_date, total_price) %>%
mutate(order_date = ymd(order_date)) %>%
mutate(year = year(order_date)) %>%
group_by(year) %>%
summarise(sales = sum(total_price)) %>%
ungroup()
bike_sales_y %>%
mutate(sales = dollar(sales)) %>%
knitr::kable()
| year | sales |
|---|---|
| 2011 | $11,292,885 |
| 2012 | $12,163,075 |
| 2013 | $16,480,775 |
| 2014 | $13,924,085 |
| 2015 | $17,171,510 |
calculate_pct_diff <- function(data){
data %>%
mutate(sales_lag_1 = lag(sales, n = 1)) %>%
mutate(sales_lag_1 = case_when(
is.na(sales_lag_1) ~ sales,
TRUE ~ sales_lag_1
)) %>%
mutate(diff_1 = sales - sales_lag_1) %>%
mutate(pct_diff_1 = diff_1 / sales_lag_1) %>%
mutate(pct_diff_1_chr = percent(pct_diff_1))
}
bike_sales_y %>%
calculate_pct_diff() %>%
mutate(sales = dollar(sales),
diff_1 = dollar(diff_1)) %>%
select(year, sales, diff_1, pct_diff_1_chr) %>%
rename(Year = year,
Sales = sales,
`Change ($)` = diff_1,
`Change (%)` = pct_diff_1_chr) %>%
knitr::kable()
| Year | Sales | Change ($) | Change (%) |
|---|---|---|---|
| 2011 | $11,292,885 | $0 | 0.0% |
| 2012 | $12,163,075 | $870,190 | 7.7% |
| 2013 | $16,480,775 | $4,317,700 | 35.5% |
| 2014 | $13,924,085 | -$2,556,690 | -15.5% |
| 2015 | $17,171,510 | $3,247,425 | 23.3% |
bike_sales_m <- bike_orderlines_wrangled_tbl %>%
select(order_date, total_price) %>%
mutate(order_date = ymd(order_date)) %>%
mutate(year_month = floor_date(order_date, unit = "month")) %>%
group_by(year_month) %>%
summarise(sales = sum(total_price))
head(bike_sales_m, 10) %>%
mutate(sales = dollar(sales)) %>%
knitr::kable()
| year_month | sales |
|---|---|
| 2011-01-01 | $483,015 |
| 2011-02-01 | $1,162,075 |
| 2011-03-01 | $659,975 |
| 2011-04-01 | $1,827,140 |
| 2011-05-01 | $844,170 |
| 2011-06-01 | $1,413,445 |
| 2011-07-01 | $1,194,430 |
| 2011-08-01 | $679,790 |
| 2011-09-01 | $814,720 |
| 2011-10-01 | $734,920 |
bike_sales_y %>%
ggplot(aes(x = year, y = sales, color = sales)) +
geom_point(size = 5) +
geom_line(linewidth = 2) +
geom_smooth(method = "lm", formula = 'y ~ x', se = FALSE) +
expand_limits(y = c(0, 20e6)) +
scale_colour_continuous(low = "red", high = "black",
labels = dollar_format(scale = 1/1e6, suffix = "M")) +
scale_y_continuous(labels = dollar_format(scale = 1/1e6, suffix = "M")) +
labs(
title = "Revenue Trend",
subtitle = "Sales are trending up and to the right!",
x = "Year",
y = "Sales (Millions)",
color = "Rev ($M)",
caption = "Total sales from 2011 to 2015"
) +
theme_minimal()
revenue_by_category2_tbl <- bike_orderlines_wrangled_tbl %>%
select(category_2, total_price) %>%
group_by(category_2) %>%
summarise(revenue = sum(total_price)) %>%
ungroup()
revenue_by_category2_tbl %>%
mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>%
ggplot(aes(category_2, revenue)) +
geom_col(fill = "#2C3E50") +
coord_flip() +
scale_y_continuous(labels = dollar_format(scale = 1/1e6, suffix = "M")) +
labs(
title = "Revenue by Product Category",
x = "Category",
y = "Revenue (Millions)"
) +
theme_minimal()
bike_orderlines_wrangled_tbl %>%
group_by(category_1) %>%
summarise(revenue = sum(total_price)) %>%
ungroup() %>%
mutate(category_1 = category_1 %>% as_factor() %>% fct_reorder(revenue)) %>%
ggplot(aes(category_1, revenue, fill = category_1)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_y_continuous(labels = dollar_format(scale = 1/1e6, suffix = "M")) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Revenue by Primary Category",
x = "Primary Category",
y = "Revenue (Millions)"
) +
theme_minimal()
cat("Total Revenue:", dollar(sum(bike_orderlines_wrangled_tbl$total_price)), "\n")
## Total Revenue: $71,032,330
cat("Average Order Value:", dollar(mean(bike_orderlines_wrangled_tbl$total_price)), "\n")
## Average Order Value: $4,540.55
cat("Total Orders:", nrow(bike_orderlines_wrangled_tbl), "\n")
## Total Orders: 15644
cat("Date Range:", min(bike_orderlines_wrangled_tbl$order_date), "to",
max(bike_orderlines_wrangled_tbl$order_date), "\n")
## Date Range: 1294358400 to 1451001600
# Save the wrangled data
saveRDS(bike_orderlines_wrangled_tbl, './bike_orderlines.rds')
Analysis complete! This report provides insights into bike sales trends, categories, and revenue patterns.