final homework

# Import Library

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(readxl)

# Load base/dimension tables (for relational database demonstration)

orderlines <- read_excel("/Users/faizhaikal/Downloads/orderlines.xlsx")

## New names:
## • `` -> `...1`

bikes <- read_excel("/Users/faizhaikal/Downloads/bikes.xlsx")
bikeshops <- read_excel("/Users/faizhaikal/Downloads/bikeshops.xlsx")

# Load enriched transactional dataset (main analysis table)

sales <- read_csv("/Users/faizhaikal/Downloads/bike_orderlines.csv", show_col_types = FALSE)

# Robust parsing and cleaning to prevent non-finite values in plots

sales <- sales %>%
mutate(
order_date = parse_date_time(order_date, orders = c("Ymd HMS", "Ymd HM", "Ymd")),
total_price = readr::parse_number(as.character(total_price))
) %>%
filter(!is.na(order_date), !is.na(total_price)) %>%
mutate(
week = floor_date(order_date, "week", week_start = 1),
month = floor_date(order_date, "month"),
quarter_label = paste0(year(order_date), "-Q", quarter(order_date))
)

glimpse(sales)

## Rows: 15,644
## Columns: 16
## $ order_date     <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10, 2011-0…
## $ order_id       <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7…
## $ order_line     <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3, 4, 1…
## $ quantity       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1…
## $ price          <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330, 1570,…
## $ total_price    <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 5330, 1570,…
## $ model          <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast of the Ea…
## $ category_1     <chr> "Mountain", "Mountain", "Mountain", "Mountain", "Road",…
## $ category_2     <chr> "Over Mountain", "Over Mountain", "Trail", "Over Mounta…
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carbon", "Ca…
## $ bikeshop_name  <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Climbers",…
## $ city           <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City", "Loui…
## $ state          <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY", "KY", "…
## $ week           <dttm> 2011-01-03, 2011-01-03, 2011-01-10, 2011-01-10, 2011-0…
## $ month          <dttm> 2011-01-01, 2011-01-01, 2011-01-01, 2011-01-01, 2011-0…
## $ quarter_label  <chr> "2011-Q1", "2011-Q1", "2011-Q1", "2011-Q1", "2011-Q1", …

make_total_ts <- function(df, period_col, period_name) {
df %>%
group_by({{ period_col }}) %>%
summarise(total_sales = sum(total_price, na.rm = TRUE), .groups = "drop") %>%
filter(is.finite(total_sales)) %>%
mutate(period_type = period_name)
}

weekly_total <- make_total_ts(sales, week, "Weekly")

ggplot(weekly_total, aes(x = week, y = total_sales)) +
geom_line(linewidth = 0.95, alpha = 0.9, color = "#2c3e50") +
geom_smooth(se = FALSE, linewidth = 0.9, color = "#e74c3c") +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Total Sales (Weekly)",
subtitle = "Weekly view highlights short-term fluctuations and operational variability",
x = "Week",
y = "Total Sales (USD)"
) +
theme_minimal(base_size = 12)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

monthly_total <- make_total_ts(sales, month, "Monthly")

ggplot(monthly_total, aes(x = month, y = total_sales)) +
geom_line(linewidth = 1.05, alpha = 0.9, color = "#2c3e50") +
geom_point(size = 1.3, alpha = 0.85, color = "#3498db") +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Total Sales (Monthly)",
subtitle = "Monthly aggregation supports tactical planning and budgeting checks",
x = "Month",
y = "Total Sales (USD)"
) +
theme_minimal(base_size = 12)

quarterly_total <- make_total_ts(sales, quarter_label, "Quarterly")

ggplot(quarterly_total, aes(x = quarter_label, y = total_sales, group = 1)) +
geom_line(linewidth = 1.05, alpha = 0.9, color = "#2c3e50") +
geom_point(size = 2.0, alpha = 0.9, color = "#9b59b6") +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Total Sales (Quarterly)",
subtitle = "Quarterly performance is commonly used for strategic reporting and evaluation",
x = "Quarter",
y = "Total Sales (USD)"
) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

shop_sales <- sales %>%
group_by(bikeshop_name) %>%
summarise(total_sales = sum(total_price, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_sales))

top10_shops <- shop_sales %>% slice_head(n = 10)

ggplot(top10_shops, aes(x = reorder(bikeshop_name, total_sales), y = total_sales, fill = bikeshop_name)) +
geom_col(alpha = 0.9, show.legend = FALSE) +
coord_flip() +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Top 10 Bikeshops by Total Sales",
subtitle = "Store-level performance comparison (highest to lowest)",
x = "Bikeshop",
y = "Total Sales (USD)"
) +
theme_minimal(base_size = 12)

total_all <- sum(shop_sales$total_sales, na.rm = TRUE)
top5_total <- sum(shop_sales$total_sales[1:5], na.rm = TRUE)
top10_total <- sum(shop_sales$total_sales[1:10], na.rm = TRUE)

concentration <- tibble(
group = c("Top 5 Stores", "Top 10 Stores", "All Stores"),
total_sales = c(top5_total, top10_total, total_all),
share = c(top5_total/total_all, top10_total/total_all, 1)
)

concentration

## # A tibble: 3 × 3
##   group         total_sales share
##   <chr>               <dbl> <dbl>
## 1 Top 5 Stores     33151035 0.467
## 2 Top 10 Stores    45447210 0.640
## 3 All Stores       71032330 1

state_sales <- sales %>%
group_by(state) %>%
summarise(total_sales = sum(total_price, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_sales))

top10_states <- state_sales %>% slice_head(n = 10)

ggplot(top10_states, aes(x = reorder(state, total_sales), y = total_sales, fill = state)) +
geom_col(alpha = 0.9, show.legend = FALSE) +
coord_flip() +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Top 10 States by Total Sales",
subtitle = "Geographic concentration of revenue across states",
x = "State",
y = "Total Sales (USD)"
) +
theme_minimal(base_size = 12)

city_sales <- sales %>%
group_by(city) %>%
summarise(total_sales = sum(total_price, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(total_sales))

top10_cities <- city_sales %>% slice_head(n = 10)

ggplot(top10_cities, aes(x = reorder(city, total_sales), y = total_sales, fill = city)) +
geom_col(alpha = 0.9, show.legend = FALSE) +
coord_flip() +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Top 10 Cities by Total Sales",
subtitle = "City-level revenue comparison (highest to lowest)",
x = "City",
y = "Total Sales (USD)"
) +
theme_minimal(base_size = 12)

final homework

Faiz Haikal_114035108

2025-12-23