library(tidyverse)
library(readxl)
library(scales)
# Option 1: If your data files are on Desktop
bikes_tbl <- read_excel("C:/Users/ASUS/OneDrive/Desktop/bikes (1).xlsx")
bikeshops_tbl <- read_excel("C:/Users/ASUS/OneDrive/Desktop/bikeshops.xlsx")
orderlines_tbl <- read_excel("C:/Users/ASUS/OneDrive/Desktop/orderlines.xlsx")
# Option 2: If files are in the same folder as your .Rmd file
# bikes_tbl <- read_excel("bikes.xlsx")
# bikeshops_tbl <- read_excel("bikeshops.xlsx")
# orderlines_tbl <- read_excel("orderlines.xlsx")
# Join all three tables together
bike_orderlines_joined_tbl <- orderlines_tbl %>%
left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>%
left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))
# Check the joined data
head(bike_orderlines_joined_tbl)
## # A tibble: 6 × 12
## ...1 order.id order.line order.date customer.id product.id quantity
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl> <dbl>
## 1 1 1 1 2011-01-07 00:00:00 2 48 1
## 2 2 1 2 2011-01-07 00:00:00 2 52 1
## 3 3 2 1 2011-01-10 00:00:00 10 76 1
## 4 4 2 2 2011-01-10 00:00:00 10 52 1
## 5 5 3 1 2011-01-10 00:00:00 6 2 1
## 6 6 3 2 2011-01-10 00:00:00 6 50 1
## # ℹ 5 more variables: model <chr>, description <chr>, price <dbl>,
## # bikeshop.name <chr>, location <chr>
# Separate description column and create calculated columns
bike_orderlines_wrangled_tbl <- bike_orderlines_joined_tbl %>%
# Separate description into three parts
separate(description,
into = c("category_1", "category_2", "frame_material"),
sep = " - ") %>%
# Separate location into city and state
separate(location,
into = c("city", "state"),
sep = ", ",
remove = FALSE) %>%
# Create total price column
mutate(total_price = price * quantity) %>%
# Remove unnecessary columns
select(-...1, -location) %>%
# Rename columns to use underscores instead of dots
set_names(names(.) %>% str_replace_all("\\.", "_"))
# View the wrangled data
head(bike_orderlines_wrangled_tbl)
## # A tibble: 6 × 15
## order_id order_line order_date customer_id product_id quantity model
## <dbl> <dbl> <dttm> <dbl> <dbl> <dbl> <chr>
## 1 1 1 2011-01-07 00:00:00 2 48 1 Jekyl…
## 2 1 2 2011-01-07 00:00:00 2 52 1 Trigg…
## 3 2 1 2011-01-10 00:00:00 10 76 1 Beast…
## 4 2 2 2011-01-10 00:00:00 10 52 1 Trigg…
## 5 3 1 2011-01-10 00:00:00 6 2 1 Super…
## 6 3 2 2011-01-10 00:00:00 6 50 1 Jekyl…
## # ℹ 8 more variables: category_1 <chr>, category_2 <chr>, frame_material <chr>,
## # price <dbl>, bikeshop_name <chr>, city <chr>, state <chr>,
## # total_price <dbl>
# Group by category_2 and sum the total_price
revenue_by_category2_tbl <- bike_orderlines_wrangled_tbl %>%
select(category_2, total_price) %>%
group_by(category_2) %>%
summarise(revenue = sum(total_price)) %>%
ungroup()
# Display the revenue table
revenue_by_category2_tbl
## # A tibble: 9 × 2
## category_2 revenue
## <chr> <dbl>
## 1 Cross Country Race 19224630
## 2 Cyclocross 2108120
## 3 Elite Road 15334665
## 4 Endurance Road 10381060
## 5 Fat Bike 1052620
## 6 Over Mountain 7571270
## 7 Sport 1932755
## 8 Trail 9373460
## 9 Triathalon 4053750
# Create horizontal bar chart
revenue_by_category2_tbl %>%
# Reorder categories by revenue (descending)
mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(revenue)) %>%
# Create plot
ggplot(aes(x = category_2, y = revenue)) +
geom_col(fill = "blue") +
# Flip coordinates to make horizontal bars
coord_flip() +
# Format y-axis labels as numbers with scientific notation
scale_y_continuous(labels = label_number(scale = 1e-6, suffix = "e+06")) +
# Add labels
labs(
title = "Revenue by Product Category",
x = "category_2",
y = "revenue"
) +
# Use minimal theme
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12)
)
# Show top 5 categories by revenue
revenue_by_category2_tbl %>%
arrange(desc(revenue)) %>%
mutate(revenue_formatted = dollar(revenue)) %>%
head(5)
## # A tibble: 5 × 3
## category_2 revenue revenue_formatted
## <chr> <dbl> <chr>
## 1 Cross Country Race 19224630 $19,224,630
## 2 Elite Road 15334665 $15,334,665
## 3 Endurance Road 10381060 $10,381,060
## 4 Trail 9373460 $9,373,460
## 5 Over Mountain 7571270 $7,571,270
# Total revenue across all categories
total_revenue <- sum(revenue_by_category2_tbl$revenue)
cat("Total Revenue:", dollar(total_revenue), "\n")
## Total Revenue: $71,032,330
```