# ── Core data manipulation ──────────────────────────────────────────────────
library(tidyverse)
library(lubridate)
library(scales)
# ── Visualisation ───────────────────────────────────────────────────────────
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(patchwork)
library(RColorBrewer)
library(viridis)
library(treemapify) # treemap plots
# ── Tables ──────────────────────────────────────────────────────────────────
library(knitr)
library(kableExtra)
library(DT) # interactive HTML tables
# ── Unsupervised Learning ────────────────────────────────────────────────────
library(cluster) # k-means, PAM, silhouette
library(factoextra) # cluster visualisation
library(NbClust) # optimal k selection
library(corrplot) # correlation matrix
# ── Supervised Learning ──────────────────────────────────────────────────────
library(caret) # train/test + CV framework
library(randomForest) # Random Forest classifier / regressor
library(xgboost) # Gradient Boosting
library(rpart) # Decision Trees
library(rpart.plot) # Decision Tree visualisation
library(pROC) # ROC / AUC
library(ROSE) # class imbalance (ROSE / SMOTE)
# ── Text ─────────────────────────────────────────────────────────────────────
library(tidytext)
library(wordcloud)
library(SnowballC)# ── Adjust this path to wherever your CSVs live ─────────────────────────────
DATA_PATH <- "C:/Users/PAYTAKHT/Desktop/spinaker/data/"
customers <- read_csv(paste0(DATA_PATH, "olist_customers_dataset.csv"))
geolocation <- read_csv(paste0(DATA_PATH, "olist_geolocation_dataset.csv"))
order_items <- read_csv(paste0(DATA_PATH, "olist_order_items_dataset.csv"))
order_payments<- read_csv(paste0(DATA_PATH, "olist_order_payments_dataset.csv"))
order_reviews <- read_csv(paste0(DATA_PATH, "olist_order_reviews_dataset.csv"))
orders <- read_csv(paste0(DATA_PATH, "olist_orders_dataset.csv"))
products <- read_csv(paste0(DATA_PATH, "olist_products_dataset.csv"))
sellers <- read_csv(paste0(DATA_PATH, "olist_sellers_dataset.csv"))
category_trans<- read_csv(paste0(DATA_PATH, "product_category_name_translation.csv"))
cat("Datasets loaded successfully.\n")## Datasets loaded successfully.
cat(sprintf("Orders: %s rows | Products: %s rows | Items: %s rows\n",
nrow(orders), nrow(products), nrow(order_items)))## Orders: 99441 rows | Products: 32951 rows | Items: 112650 rows
# ── 1. Translate product categories to English ───────────────────────────────
products <- products %>%
left_join(category_trans, by = "product_category_name") %>%
mutate(
category_en = coalesce(product_category_name_english, "unknown")
)
# ── 2. Parse all date columns ────────────────────────────────────────────────
orders <- orders %>%
mutate(across(contains("date") | contains("timestamp"),
~ymd_hms(.x, quiet = TRUE)))
# ── 3. Master order-level table ──────────────────────────────────────────────
orders_full <- orders %>%
left_join(customers, by = "customer_id") %>%
left_join(order_payments %>%
group_by(order_id) %>%
summarise(
total_payment = sum(payment_value, na.rm = TRUE),
n_installments = max(payment_installments, na.rm = TRUE),
payment_type = first(payment_type),
.groups = "drop"),
by = "order_id") %>%
left_join(order_reviews %>%
group_by(order_id) %>%
slice_max(review_score, with_ties = FALSE) %>%
select(order_id, review_score),
by = "order_id") %>%
mutate(
delivery_days = as.numeric(
difftime(order_delivered_customer_date,
order_purchase_timestamp, units = "days")),
delay_days = as.numeric(
difftime(order_delivered_customer_date,
order_estimated_delivery_date, units = "days")),
is_late = delay_days > 0,
purchase_year = year(order_purchase_timestamp),
purchase_month = month(order_purchase_timestamp, label = TRUE),
purchase_dow = wday(order_purchase_timestamp, label = TRUE)
)
# ── 4. Item-level enriched table ─────────────────────────────────────────────
items_full <- order_items %>%
left_join(products %>%
select(product_id, category_en,
product_weight_g, product_photos_qty),
by = "product_id") %>%
left_join(orders %>%
select(order_id, order_status, order_purchase_timestamp),
by = "order_id") %>%
left_join(sellers %>%
select(seller_id, seller_state),
by = "seller_id") %>%
mutate(
revenue = price + freight_value,
category_en = replace_na(category_en, "unknown")
)
# ── 5. Product-level summary (sales + "unsold" flag) ────────────────────────
delivered_items <- items_full %>%
filter(order_status == "delivered")
product_sales <- products %>%
left_join(
delivered_items %>%
group_by(product_id) %>%
summarise(
units_sold = n(),
total_revenue = sum(revenue, na.rm = TRUE),
avg_price = mean(price, na.rm = TRUE),
n_orders = n_distinct(order_id),
.groups = "drop"),
by = "product_id") %>%
mutate(
units_sold = replace_na(units_sold, 0),
total_revenue = replace_na(total_revenue, 0),
avg_price = replace_na(avg_price, 0),
n_orders = replace_na(n_orders, 0),
sold_flag = units_sold > 0
)
cat("Wrangling done.\n")## Wrangling done.
cat(sprintf("Total products: %d | Sold: %d | Never sold: %d\n",
nrow(product_sales),
sum(product_sales$sold_flag),
sum(!product_sales$sold_flag)))## Total products: 32951 | Sold: 32216 | Never sold: 735
kpi <- tibble(
Metric = c("Total Orders", "Delivered Orders",
"Total Revenue (R$)", "Unique Products",
"Active Sellers", "Avg Review Score"),
Value = c(
comma(nrow(orders)),
comma(sum(orders$order_status == "delivered", na.rm = TRUE)),
dollar(sum(order_payments$payment_value, na.rm = TRUE),
prefix = "R$ ", big.mark = ","),
comma(nrow(products)),
comma(nrow(sellers)),
round(mean(order_reviews$review_score, na.rm = TRUE), 2)
)
)
kpi %>%
kbl(caption = "Key Business Metrics") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
column_spec(2, bold = TRUE, color = "#2c7bb6")| Metric | Value |
|---|---|
| Total Orders | 99,441 |
| Delivered Orders | 96,478 |
| Total Revenue (R\() </td> <td style="text-align:left;font-weight: bold;color: rgba(44, 123, 182, 255) !important;"> R\) 16,008,872 | |
| Unique Products | 32,951 |
| Active Sellers | 3,095 |
| Avg Review Score | 4.09 |
monthly_revenue <- order_payments %>%
left_join(orders %>% select(order_id, order_purchase_timestamp),
by = "order_id") %>%
filter(!is.na(order_purchase_timestamp)) %>%
mutate(month = floor_date(order_purchase_timestamp, "month")) %>%
group_by(month) %>%
summarise(revenue = sum(payment_value, na.rm = TRUE),
n_orders = n_distinct(order_id), .groups = "drop") %>%
filter(between(month, as.Date("2017-01-01"), as.Date("2018-09-01")))
p1 <- ggplot(monthly_revenue, aes(month, revenue)) +
geom_area(fill = "#3182bd", alpha = .3) +
geom_line(color = "#3182bd", linewidth = 1) +
geom_point(color = "#3182bd", size = 2) +
scale_y_continuous(labels = dollar_format(prefix = "R$ ", big.mark = ",")) +
scale_x_date(date_labels = "%b %Y", date_breaks = "2 months") +
labs(title = "Monthly Revenue", x = NULL, y = "Revenue (R$)") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 <- ggplot(monthly_revenue, aes(month, n_orders)) +
geom_col(fill = "#e6550d", alpha = .8) +
scale_x_date(date_labels = "%b %Y", date_breaks = "2 months") +
scale_y_continuous(labels = comma) +
labs(title = "Monthly Order Volume", x = NULL, y = "# Orders") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p1 / p2orders %>%
count(order_status) %>%
mutate(pct = n / sum(n),
order_status = fct_reorder(order_status, n)) %>%
ggplot(aes(order_status, n, fill = order_status)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = percent(pct, .1)), hjust = -.1, size = 3.5) +
coord_flip() +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(expand = expansion(mult = c(0, .15))) +
labs(title = "Order Status Distribution",
x = NULL, y = "# Orders") +
theme_minimal(base_size = 12)top_cat <- delivered_items %>%
count(category_en, wt = NULL, name = "units_sold") %>%
mutate(revenue = (delivered_items %>%
group_by(category_en) %>%
summarise(r = sum(revenue)))[["r"]][
match(category_en, (delivered_items %>%
group_by(category_en) %>%
summarise(r = sum(revenue)))[["category_en"]])]) %>%
arrange(desc(units_sold)) %>%
slice_head(n = 20)
# Re-compute cleanly
top_cat <- delivered_items %>%
group_by(category_en) %>%
summarise(units_sold = n(),
total_revenue = sum(revenue, na.rm = TRUE),
avg_price = mean(price, na.rm = TRUE),
.groups = "drop") %>%
arrange(desc(units_sold)) %>%
slice_head(n = 20)
ggplot(top_cat, aes(reorder(category_en, units_sold), units_sold,
fill = total_revenue)) +
geom_col() +
coord_flip() +
scale_fill_viridis_c(labels = dollar_format(prefix = "R$ ", big.mark = ","),
name = "Total Revenue") +
scale_y_continuous(labels = comma) +
labs(title = "Top 20 Product Categories by Units Sold",
subtitle = "Color = Total Revenue",
x = NULL, y = "Units Sold") +
theme_minimal(base_size = 12)cat_summary <- delivered_items %>%
group_by(category_en) %>%
summarise(units_sold = n(),
total_revenue = sum(revenue, na.rm = TRUE),
avg_price = mean(price, na.rm = TRUE),
.groups = "drop") %>%
filter(units_sold >= 50) # remove noise from tiny categories
med_units <- median(cat_summary$units_sold)
med_revenue <- median(cat_summary$total_revenue)
cat_summary <- cat_summary %>%
mutate(quadrant = case_when(
units_sold >= med_units & total_revenue >= med_revenue ~ "Stars",
units_sold >= med_units & total_revenue < med_revenue ~ "Volume Leaders",
units_sold < med_units & total_revenue >= med_revenue ~ "Premium Niche",
TRUE ~ "Underperformers"
))
ggplot(cat_summary, aes(units_sold, total_revenue,
color = quadrant, size = avg_price)) +
geom_point(alpha = .75) +
geom_vline(xintercept = med_units, linetype = "dashed", color = "grey50") +
geom_hline(yintercept = med_revenue, linetype = "dashed", color = "grey50") +
geom_text_repel(data = cat_summary %>% slice_max(units_sold, n = 10),
aes(label = category_en), size = 3, show.legend = FALSE) +
scale_x_log10(labels = comma) +
scale_y_log10(labels = dollar_format(prefix = "R$ ", big.mark = ",")) +
scale_color_brewer(palette = "Set1") +
labs(title = "Product Category Quadrant Analysis",
subtitle = "Axes are on log scale; size = average unit price",
x = "Units Sold (log)", y = "Total Revenue (log)",
color = "Quadrant", size = "Avg Price (R$)") +
theme_minimal(base_size = 12)delivered_items %>%
group_by(category_en) %>%
summarise(total_revenue = sum(revenue, na.rm = TRUE),
units_sold = n(), .groups = "drop") %>%
slice_max(total_revenue, n = 30) %>%
ggplot(aes(area = total_revenue, fill = units_sold,
label = paste0(category_en, "\n",
dollar(total_revenue, prefix = "R$", scale = 1e-3,
suffix = "K", big.mark = ",")))) +
geom_treemap() +
geom_treemap_text(colour = "white", place = "centre", grow = FALSE, size = 10) +
scale_fill_viridis_c(name = "Units Sold", labels = comma) +
labs(title = "Revenue Treemap – Top 30 Categories") +
theme_minimal()top_products <- delivered_items %>%
group_by(product_id) %>%
summarise(units_sold = n(),
total_revenue = sum(revenue, na.rm = TRUE),
avg_price = mean(price, na.rm = TRUE),
.groups = "drop") %>%
left_join(products %>% select(product_id, category_en),
by = "product_id") %>%
arrange(desc(units_sold)) %>%
slice_head(n = 20)
top_products %>%
select(product_id, category_en, units_sold, total_revenue, avg_price) %>%
mutate(total_revenue = dollar(total_revenue, prefix = "R$ ", big.mark = ","),
avg_price = dollar(avg_price, prefix = "R$ ")) %>%
datatable(caption = "Top 20 Best-Selling Products",
options = list(pageLength = 10),
rownames = FALSE,
colnames = c("Product ID","Category","Units Sold",
"Total Revenue","Avg Price"))orders_full %>%
filter(order_status == "delivered") %>%
mutate(hour = hour(order_purchase_timestamp),
dow = wday(order_purchase_timestamp, label = TRUE, week_start = 1)) %>%
count(dow, hour) %>%
ggplot(aes(hour, dow, fill = n)) +
geom_tile(color = "white") +
scale_fill_viridis_c(name = "# Orders", labels = comma) +
scale_x_continuous(breaks = 0:23) +
labs(title = "Purchase Heatmap: Day of Week × Hour",
x = "Hour of Day", y = NULL) +
theme_minimal(base_size = 12) +
theme(panel.grid = element_blank())sold_summary <- product_sales %>%
count(sold_flag) %>%
mutate(pct = n / sum(n),
label = ifelse(sold_flag, "Sold", "Never Sold"))
ggplot(sold_summary, aes(x = 2, y = pct, fill = label)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y", start = 0) +
xlim(.5, 2.5) +
geom_text(aes(label = paste0(label, "\n", percent(pct, .1))),
position = position_stack(vjust = .5), size = 5) +
scale_fill_manual(values = c("Never Sold" = "#d7301f",
"Sold" = "#41ab5d")) +
labs(title = "Product Catalogue: Sold vs. Never Sold") +
theme_void() +
theme(legend.position = "none")# categories with the most unsold products
unsold_cat <- product_sales %>%
group_by(category_en) %>%
summarise(total = n(),
unsold = sum(!sold_flag),
sold = sum(sold_flag),
unsold_pct= unsold / total,
.groups = "drop") %>%
filter(total >= 10) %>%
arrange(desc(unsold_pct))
# Top 15 categories by unsold %
p_unsold_pct <- unsold_cat %>%
slice_head(n = 15) %>%
ggplot(aes(reorder(category_en, unsold_pct), unsold_pct, fill = unsold)) +
geom_col() +
coord_flip() +
scale_fill_gradient(low = "#fdd0a2", high = "#d7301f", name = "# Unsold") +
scale_y_continuous(labels = percent) +
labs(title = "Top 15 Categories: Highest Unsold Rate",
x = NULL, y = "Unsold Rate") +
theme_minimal(base_size = 11)
# Top 15 categories by absolute # unsold
p_unsold_abs <- unsold_cat %>%
slice_max(unsold, n = 15) %>%
ggplot(aes(reorder(category_en, unsold), unsold, fill = unsold_pct)) +
geom_col() +
coord_flip() +
scale_fill_gradient(low = "#fdd0a2", high = "#d7301f",
labels = percent, name = "Unsold %") +
scale_y_continuous(labels = comma) +
labs(title = "Top 15 Categories: Most Unsold Products (Absolute)",
x = NULL, y = "# Unsold Products") +
theme_minimal(base_size = 11)
p_unsold_pct / p_unsold_abscompare_df <- product_sales %>%
filter(!is.na(product_weight_g)) %>%
mutate(group = ifelse(sold_flag, "Sold", "Never Sold"))
# Weight distribution
p_w <- ggplot(compare_df, aes(product_weight_g, fill = group)) +
geom_density(alpha = .5) +
scale_x_log10(labels = comma) +
scale_fill_manual(values = c("Never Sold" = "#d7301f", "Sold" = "#41ab5d")) +
labs(title = "Product Weight Distribution",
x = "Weight (g, log scale)", fill = NULL) +
theme_minimal(base_size = 11)
# Photo count
p_ph <- compare_df %>%
filter(!is.na(product_photos_qty)) %>%
ggplot(aes(factor(pmin(product_photos_qty, 10)), fill = group)) +
geom_bar(position = "fill") +
scale_fill_manual(values = c("Never Sold" = "#d7301f", "Sold" = "#41ab5d")) +
scale_y_continuous(labels = percent) +
labs(title = "Photo Count (capped @ 10)",
x = "# Product Photos", y = "Proportion", fill = NULL) +
theme_minimal(base_size = 11)
(p_w + p_ph + plot_layout(guides = "collect")) &
theme(legend.position = "bottom")product_sales %>%
filter(!sold_flag) %>%
select(product_id, category_en,
product_weight_g, product_photos_qty) %>%
arrange(category_en) %>%
datatable(caption = "Never-Sold Products",
options = list(pageLength = 10, scrollX = TRUE),
rownames = FALSE,
colnames = c("Product ID","Category","Weight (g)","# Photos"))reference_date <- as.Date("2018-10-18") # day after last purchase in dataset
rfm <- orders_full %>%
filter(order_status == "delivered",
!is.na(total_payment),
total_payment > 0) %>% # guard: no zero monetary
group_by(customer_unique_id) %>%
summarise(
recency = as.numeric(reference_date - max(as.Date(order_purchase_timestamp),
na.rm = TRUE)),
frequency = n_distinct(order_id),
monetary = sum(total_payment, na.rm = TRUE),
.groups = "drop"
) %>%
# drop any row where a value is NA, 0, negative, or Inf
filter(
if_all(c(recency, frequency, monetary),
~ !is.na(.) & is.finite(.) & . > 0)
)
# Log-transform -> scale -> drop any column with zero variance
rfm_log <- rfm %>%
mutate(across(c(recency, frequency, monetary), log1p)) %>%
select(recency, frequency, monetary)
# scale() returns a matrix; keep as plain numeric matrix for kmeans
rfm_scaled <- scale(rfm_log)
# Final safety: remove rows still NA/NaN/Inf after scaling
# (can happen if a column has zero variance -> sd=0 -> division by 0)
ok_rows <- complete.cases(rfm_scaled) &
apply(rfm_scaled, 1, function(x) all(is.finite(x)))
rfm_scaled <- rfm_scaled[ok_rows, ]
rfm <- rfm[ok_rows, ]
cat(sprintf("RFM table: %d customers (after cleaning)\n", nrow(rfm)))## RFM table: 93356 customers (after cleaning)
cat(sprintf("NA in rfm_scaled: %d | Inf: %d\n",
sum(is.na(rfm_scaled)), sum(!is.finite(rfm_scaled))))## NA in rfm_scaled: 0 | Inf: 0
## recency frequency monetary
## Min. : 50.0 Min. : 1.000 Min. : 9.59
## 1st Qu.:164.0 1st Qu.: 1.000 1st Qu.: 63.06
## Median :268.0 Median : 1.000 Median : 107.78
## Mean :287.5 Mean : 1.033 Mean : 165.20
## 3rd Qu.:396.0 3rd Qu.: 1.000 3rd Qu.: 182.57
## Max. :745.0 Max. :15.000 Max. :13664.08
corrplot(cor(rfm %>% select(recency, frequency, monetary)),
method = "color", addCoef.col = "black",
tl.col = "black", tl.srt = 45,
title = "RFM Feature Correlations",
mar = c(0,0,2,0))set.seed(42)
# fviz_nbclust iterates kmeans for every k on the full matrix -> O(n * k) RAM.
# With ~90k customers that blows memory. Sample 3 000 rows for diagnostics only;
# the final kmeans() below still runs on the full rfm_scaled.
SAMPLE_N <- 3000
sample_idx <- sample(nrow(rfm_scaled), min(SAMPLE_N, nrow(rfm_scaled)))
rfm_sample <- rfm_scaled[sample_idx, ]
# ── Elbow (WSS) ──────────────────────────────────────────────────────────────
wss_vals <- map_dbl(1:10, function(k) {
km <- kmeans(rfm_sample, centers = k, nstart = 10, iter.max = 100)
km$tot.withinss
})
tibble(k = 1:10, wss = wss_vals) %>%
ggplot(aes(k, wss)) +
geom_line(color = "#3182bd", linewidth = 1) +
geom_point(color = "#3182bd", size = 3) +
scale_x_continuous(breaks = 1:10) +
labs(title = "Elbow Method — Optimal k for K-Means",
subtitle = paste0("Computed on a random sample of ",
comma(nrow(rfm_sample)), " customers"),
x = "Number of clusters k", y = "Total Within-Cluster SS") +
theme_minimal(base_size = 12)# ── Silhouette ───────────────────────────────────────────────────────────────
# Use a smaller sample for silhouette (O(n^2) distance matrix)
SIL_N <- 1500
sil_idx <- sample(nrow(rfm_scaled), min(SIL_N, nrow(rfm_scaled)))
rfm_sil <- rfm_scaled[sil_idx, ]
sil_vals <- map_dbl(2:10, function(k) {
km <- kmeans(rfm_sil, centers = k, nstart = 10, iter.max = 100)
ss <- silhouette(km$cluster, dist(rfm_sil))
mean(ss[, "sil_width"])
})
tibble(k = 2:10, sil = sil_vals) %>%
ggplot(aes(k, sil)) +
geom_line(color = "#e6550d", linewidth = 1) +
geom_point(aes(color = sil == max(sil)), size = 3, show.legend = FALSE) +
scale_color_manual(values = c("FALSE" = "#e6550d", "TRUE" = "#31a354")) +
scale_x_continuous(breaks = 2:10) +
labs(title = "Silhouette Method — Optimal k",
subtitle = paste0("Green point = best k | sample of ",
comma(nrow(rfm_sil)), " customers"),
x = "Number of clusters k", y = "Average Silhouette Width") +
theme_minimal(base_size = 12)set.seed(42)
K <- 4
km_fit <- kmeans(rfm_scaled, centers = K, nstart = 50, iter.max = 300)
rfm <- rfm %>% mutate(cluster = factor(km_fit$cluster))
cat("Cluster sizes:\n")## Cluster sizes:
##
## 1 2 3 4
## 27886 37861 24808 2801
# PCA for 2-D projection
fviz_cluster(km_fit, data = rfm_scaled,
geom = "point", ellipse.type = "convex",
palette = "Set1", ggtheme = theme_minimal(base_size = 12),
main = "K-Means Customer Clusters (PCA Projection)")cluster_profile <- rfm %>%
group_by(cluster) %>%
summarise(
n_customers = n(),
avg_recency = round(mean(recency)),
avg_frequency= round(mean(frequency), 2),
avg_monetary = round(mean(monetary), 2),
.groups = "drop"
) %>%
mutate(
segment_label = case_when(
avg_recency < median(rfm$recency) & avg_frequency > 1 ~ "Champions",
avg_recency < median(rfm$recency) ~ "Promising",
avg_monetary > quantile(rfm$monetary, .75) ~ "At-Risk High Value",
TRUE ~ "Hibernating"
)
)
cluster_profile %>%
kbl(caption = "RFM Cluster Summary",
col.names = c("Cluster","# Customers","Avg Recency (days)",
"Avg Frequency","Avg Monetary (R$)","Segment")) %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE) %>%
column_spec(6, bold = TRUE, color = "white",
background = c("#3182bd","#41ab5d","#d7301f","#fd8d3c"))| Cluster | # Customers | Avg Recency (days) | Avg Frequency | Avg Monetary (R$) | Segment |
|---|---|---|---|---|---|
| 1 | 27886 | 335 | 1.00 | 318.87 | At-Risk High Value |
| 2 | 37861 | 366 | 1.00 | 69.36 | Hibernating |
| 3 | 24808 | 117 | 1.00 | 122.54 | Promising |
| 4 | 2801 | 270 | 2.11 | 308.59 | At-Risk High Value |