# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Import data
bikes_tbl <- read_excel("C:/Users/dell/OneDrive - University of Finance and Economics/Documents/datas_rstudio/bikes.xlsx")
bikeshops_tbl <- read_excel("C:/Users/dell/OneDrive - University of Finance and Economics/Documents/datas_rstudio/bikeshops.xlsx")
orderlines_tbl <- read_excel("C:/Users/dell/OneDrive - University of Finance and Economics/Documents/datas_rstudio/orderlines.xlsx")
## New names:
## • `` -> `...1`
# Join and wrangle data
bike_orderlines_wrangled_tbl <- left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id")) %>% 
  left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id")) %>%
  separate(description, 
           into = c('category.1', 'category.2', 'frame.material'), 
           sep  = ' - ') %>% 
  separate(location, 
           into = c('city', 'state'), 
           sep  = ', ',
           remove = FALSE) %>% 
  mutate(total.price = price * quantity) %>% 
  select(-...1, -location) %>% 
  select(contains('date'), contains('id'), 
         contains('order'), 
         quantity, price, total.price, 
         everything()) %>% 
  rename(order_date = order.date) %>%
  set_names(names(.) %>% str_replace_all("\\.", "_"))

# Analyze sales by product categories
category_sales_tbl <- bike_orderlines_wrangled_tbl %>%
  group_by(category_1, category_2, frame_material) %>%
  summarise(Sales = sum(total_price), .groups = 'drop') %>%
  arrange(desc(Sales)) %>%
  rename(`Prime category` = category_1,
         `Secondary category` = category_2,
         `Frame Material` = frame_material) %>%
  mutate(Sales = dollar(Sales))

# Display the results
print("Product Categories by Sales (Highest to Lowest):")
## [1] "Product Categories by Sales (Highest to Lowest):"
print(category_sales_tbl)
## # A tibble: 13 × 4
##    `Prime category` `Secondary category` `Frame Material` Sales      
##    <chr>            <chr>                <chr>            <chr>      
##  1 Mountain         Cross Country Race   Carbon           $15,906,070
##  2 Road             Elite Road           Carbon           $9,696,870 
##  3 Road             Endurance Road       Carbon           $8,768,610 
##  4 Mountain         Over Mountain        Carbon           $7,571,270 
##  5 Road             Elite Road           Aluminum         $5,637,795 
##  6 Mountain         Trail                Carbon           $4,835,850 
##  7 Mountain         Trail                Aluminum         $4,537,610 
##  8 Road             Triathalon           Carbon           $4,053,750 
##  9 Mountain         Cross Country Race   Aluminum         $3,318,560 
## 10 Road             Cyclocross           Carbon           $2,108,120 
## 11 Mountain         Sport                Aluminum         $1,932,755 
## 12 Road             Endurance Road       Aluminum         $1,612,450 
## 13 Mountain         Fat Bike             Aluminum         $1,052,620
# Create a visualization
viz_tbl <- bike_orderlines_wrangled_tbl %>%
  group_by(category_1, category_2, frame_material) %>%
  summarise(Sales = sum(total_price), .groups = 'drop') %>%
  arrange(desc(Sales))

# Top 15 categories plot
viz_tbl %>%
  head(15) %>%
  mutate(category = paste(category_1, category_2, frame_material, sep = " - "),
         category = fct_reorder(category, Sales)) %>%
  ggplot(aes(x = Sales, y = category)) +
  geom_col(fill = "#2c3e50") +
  scale_x_continuous(labels = dollar_format()) +
  labs(
    title = "Top 15 Product Categories by Sales",
    subtitle = "Mountain bikes lead in total revenue",
    x = "Total Sales",
    y = "Product Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, color = "gray40"),
    axis.title = element_text(face = "bold")
  )

# Summary statistics
cat("\n\nSummary Statistics:\n")
## 
## 
## Summary Statistics:
cat("Total Revenue:", dollar(sum(bike_orderlines_wrangled_tbl$total_price)), "\n")
## Total Revenue: $71,032,330
cat("Number of Product Categories:", nrow(category_sales_tbl), "\n")
## Number of Product Categories: 13
cat("Average Sales per Category:", dollar(mean(viz_tbl$Sales)), "\n")
## Average Sales per Category: $5,464,025