Bike Sales Plots (robust)

# ---- Robust single-chunk plot with diagnostics ----

# 0. helper to install missing packages (uncomment to auto-install)
needed <- c("tidyverse", "lubridate", "scales")
missing_pkgs <- needed[!needed %in% installed.packages()[, "Package"]]
if(length(missing_pkgs)) {
  message("Installing missing packages: ", paste(missing_pkgs, collapse = ", "))
  install.packages(missing_pkgs)
}

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

# 1. load the data
file_name <- "bike_orderlines.rds"
if(!file.exists(file_name)) stop(glue::glue("File not found: {file_name}. Put the .rds in your project folder."))

bike_data <- readRDS(file_name)

# 2. show structure so you can paste it if errors occur
message("\n---- DATA DIAGNOSTICS ----")

## 
## ---- DATA DIAGNOSTICS ----

message("Number of rows: ", nrow(bike_data))

## Number of rows: 15644

message("Columns:")

## Columns:

print(colnames(bike_data))

##  [1] "order_date"     "order_id"       "customer_id"    "product_id"    
##  [5] "order_line"     "quantity"       "price"          "total_price"   
##  [9] "model"          "category_1"     "category_2"     "frame_material"
## [13] "bikeshop_name"  "city"           "state"

message("\nA few rows:")

## 
## A few rows:

print(utils::head(bike_data, 6))

## # A tibble: 6 × 15
##   order_date          order_id customer_id product_id order_line quantity price
##   <dttm>                 <dbl>       <dbl>      <dbl>      <dbl>    <dbl> <dbl>
## 1 2011-01-07 00:00:00        1           2         48          1        1  6070
## 2 2011-01-07 00:00:00        1           2         52          2        1  5970
## 3 2011-01-10 00:00:00        2          10         76          1        1  2770
## 4 2011-01-10 00:00:00        2          10         52          2        1  5970
## 5 2011-01-10 00:00:00        3           6          2          1        1 10660
## 6 2011-01-10 00:00:00        3           6         50          2        1  3200
## # ℹ 8 more variables: total_price <dbl>, model <chr>, category_1 <chr>,
## #   category_2 <chr>, frame_material <chr>, bikeshop_name <chr>, city <chr>,
## #   state <chr>

# 3. identify date, category and price columns (try common alternatives)
col_names <- tolower(colnames(bike_data))

# find date column: prefer exact 'order_date', else any with 'date' in name
date_col <- NULL
if("order_date" %in% col_names) date_col <- colnames(bike_data)[which(col_names == "order_date")]
if(is.null(date_col)) {
  cand <- grep("date", col_names, value = TRUE)
  if(length(cand) >= 1) date_col <- colnames(bike_data)[which(col_names %in% cand)[1]]
}
if(is.null(date_col)) stop("No date column found. Expected 'order_date' or another column containing 'date' in its name. Columns: \n", paste(colnames(bike_data), collapse = ", "))

# find price column: prefer 'total_price' else 'price'/'total'/'revenue'
price_col <- NULL
for (nm in c("total_price","totalprice","price","total","revenue","amount")) {
  if(nm %in% col_names) { price_col <- colnames(bike_data)[which(col_names == nm)]; break }
}
if(is.null(price_col)) stop("No price column found. Expected 'total_price' or 'price' or 'total' etc. Columns: \n", paste(colnames(bike_data), collapse = ", "))

# find category column: prefer 'category_2' else 'category'/'cat'/'type'
cat_col <- NULL
for (nm in c("category_2","category2","category","cat","type","category_1")) {
  if(nm %in% col_names) { cat_col <- colnames(bike_data)[which(col_names == nm)]; break }
}
if(is.null(cat_col)) stop("No category column found. Expected 'category_2' or 'category' etc. Columns: \n", paste(colnames(bike_data), collapse = ", "))

message("\nDetected columns -> date: ", date_col, " | price: ", price_col, " | category: ", cat_col)

## 
## Detected columns -> date: order_date | price: total_price | category: category_2

# 4. convert date column to Date (try several formats)
bike_data[[date_col]] <- {
  x <- bike_data[[date_col]]
  if(inherits(x, "Date") || inherits(x, "POSIXt")) {
    as_date(x)
  } else {
    # try common parsers
    parsed <- parse_date_time(x, orders = c("Ymd", "ymd", "dmy", "mdy", "Y-m-d", "m/d/Y", "d/m/Y", "Ymd HMS"))
    if(all(is.na(parsed))) {
      # last resort: as.Date
      parsed2 <- as.Date(x)
      if(all(is.na(parsed2))) stop("Failed to parse the date column. Sample values: \n", paste(head(x,5), collapse = ", "))
      parsed2
    } else {
      as_date(parsed)
    }
  }
}

# 5. ensure price is numeric
bike_data[[price_col]] <- as.numeric(bike_data[[price_col]])
if(all(is.na(bike_data[[price_col]]))) stop("Price column coerced to NA. Check that ", price_col, " contains numeric values.")

# 6. ensure category is factor/character
bike_data[[cat_col]] <- as.character(bike_data[[cat_col]])
if(all(is.na(bike_data[[cat_col]]))) stop("Category column coerced to NA. Check that ", cat_col, " contains values.")

# 7. prepare monthly summary
bike_monthly_sales <- bike_data %>%
  mutate(order_month = floor_date(.data[[date_col]], "month")) %>%
  group_by(category = .data[[cat_col]], order_month) %>%
  summarise(sales = sum(.data[[price_col]], na.rm = TRUE), .groups = "drop")

message("\nSummary example (top rows):")

## 
## Summary example (top rows):

print(utils::head(bike_monthly_sales, 8))

## # A tibble: 8 × 3
##   category           order_month  sales
##   <chr>              <date>       <dbl>
## 1 Cross Country Race 2011-01-01  143660
## 2 Cross Country Race 2011-02-01  324400
## 3 Cross Country Race 2011-03-01  142000
## 4 Cross Country Race 2011-04-01  498580
## 5 Cross Country Race 2011-05-01  220310
## 6 Cross Country Race 2011-06-01  364420
## 7 Cross Country Race 2011-07-01  307300
## 8 Cross Country Race 2011-08-01  110600

# 8. plot
gg <- bike_monthly_sales %>%
  ggplot(aes(order_month, sales, color = category)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
  facet_wrap(~ category, scales = "free_y") +
  labs(title = "Sales by Category Over Time", x = "", y = "Sales") +
  theme_minimal() +
  theme(legend.position = "none",
        strip.background = element_rect(fill = "navy"),
        strip.text = element_text(color = "white", size = 11))

print(gg)