# Title: E-commerce Customer Segmentation and Order Volume Prediction
# Introduction
# This project aims to explore a real-world e-commerce dataset from Pakistan's largest online platform. The primary objective is to apply machine learning techniques to:
# - Classify customers based on purchasing behavior.
# - Predict future product demand in terms of order volume.
# Through this analysis, the project seeks to provide actionable insights for marketing strategy and inventory planning.
# Load necessary packages
library(readxl)
library(dplyr)
library(lubridate)
library(stringr)
# Read the Excel file
df <- read_excel("C:/Users/A/Downloads/Pakistan Largest Ecommerce Dataset Updated.xlsx")
# Assign df to raw_data
raw_data <- df
# Remove completely blank rows based on item_id
raw_data <- raw_data %>%
filter(!is.na(item_id))
cat("Row count after removing blanks:", nrow(raw_data), "\n")
# Remove redundant columns
raw_data <- raw_data %>%
select(-grand_total, -sales_commission_code, -`Working Date`)
# Count invalid values
invalid_values <- c("#REF!", "\\N", "", "#N/A")
count_invalids <- function(df) {
sapply(df, function(col) {
sum(
is.na(col) | trimws(as.character(col)) %in% invalid_values
)
})
}
invalid_counts <- count_invalids(raw_data)
cat("Invalid row counts per column:\n")
print(invalid_counts)
# Remove invalid rows (excluding category_name_1)
columns_to_check <- setdiff(names(raw_data), "category_name_1")
invalid_row_mask <- apply(raw_data[columns_to_check], 1, function(row) {
any(
is.na(row) | trimws(as.character(row)) %in% invalid_values
)
})
clean_data <- raw_data[!invalid_row_mask, ]
cat("Row count after removing invalid rows:", nrow(clean_data), "\n")
print(count(clean_data))
# Tag rows for category imputation
clean_data <- clean_data %>%
mutate(
category_status = ifelse(
trimws(as.character(category_name_1)) %in% c("", "\\N"),
"Need to impute",
"Have category"
)
)
# SKU cleaning
clean_data$sku <- trimws(clean_data$sku)
clean_data <- clean_data %>%
mutate(
sku_prefix = sapply(sku, function(x) {
x <- as.character(x)
if (is.na(x)) return(NA)
parts <- unlist(strsplit(x, "[-_ ]"))
if (length(parts) == 0) return(NA)
if (nchar(parts[1]) == 1 && length(parts) > 1) {
return(paste0(parts[1], "_", parts[2]))
} else {
return(parts[1])
}
})
)
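# Quick illustration of the prefix rule on hypothetical SKU strings (not from the
# dataset): keep the first token, or the first two tokens joined by "_" when the
# first token is a single character.
example_skus <- c("kcc-123-xyz", "B_K7_555", "UK_90")
sapply(example_skus, function(x) {
  parts <- unlist(strsplit(x, "[-_ ]"))
  if (nchar(parts[1]) == 1 && length(parts) > 1) paste0(parts[1], "_", parts[2]) else parts[1]
})
# Expected: "kcc", "B_K7", "UK"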
# Create sku_prefix_2
clean_data <- clean_data %>%
mutate(
sku_prefix_2 = ifelse(
nchar(sku_prefix) >= 16 & nchar(sku_prefix) <= 19,
substr(sku_prefix, 1, 3),
tolower(sku_prefix)
)
)
# Create mapping table for imputation
prefix_category_table <- clean_data %>%
filter(category_status == "Have category") %>%
group_by(sku_prefix_2, category_name_1) %>%
summarise(distinct_sku_count = n_distinct(sku), .groups = "drop")
final_prefix_category_table <- prefix_category_table %>%
group_by(sku_prefix_2) %>%
slice_max(order_by = distinct_sku_count, n = 1, with_ties = FALSE) %>%
ungroup()
# Impute category_name_1
clean_data <- clean_data %>%
left_join(final_prefix_category_table %>% select(sku_prefix_2, imputed_category = category_name_1),
by = "sku_prefix_2") %>%
mutate(
category_final = ifelse(
category_status == "Have category",
category_name_1,
imputed_category
)
)
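# Quick sanity check on the imputation (a minimal sketch using the columns created
# above): how many rows needed a category, and how many still lack one because no
# matching sku_prefix_2 was found in the mapping table.
imputation_check <- clean_data %>%
  filter(category_status == "Need to impute") %>%
  summarise(
    rows_needing_imputation = n(),
    still_missing = sum(is.na(category_final))
  )
print(imputation_check)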
# Final cleaning
columns_to_check <- setdiff(names(clean_data), "category_name_1")
invalid_row_mask <- apply(clean_data[columns_to_check], 1, function(row) {
any(
is.na(row) | trimws(as.character(row)) %in% invalid_values
)
})
clean_data <- clean_data[!invalid_row_mask, ]
cat("Final row count after full cleaning:", nrow(clean_data), "\n")
print(count(clean_data))
# Final invalid check
invalid_counts <- count_invalids(clean_data)
cat("Final invalid counts:\n")
print(invalid_counts)
# View the cleaned data
print(clean_data)
Row count after removing blanks: 584524
Invalid row counts per column:
item_id status created_at sku price
0 19 0 20 0
qty_ordered category_name_1 discount_amount payment_method BI Status
0 8014 0 0 1
Year Month Customer Since M-Y FY
0 0 11 0 0
Customer ID
11
Row count after removing invalid rows: 584473
# A tibble: 1 × 1
       n
   <int>
1 584473
Final row count after full cleaning: 583996
# A tibble: 1 × 1
       n
   <int>
1 583996
Final invalid counts:
item_id status created_at sku
0 0 0 0
price qty_ordered category_name_1 discount_amount
0 0 7516 0
payment_method BI Status Year Month
0 0 0 0
Customer Since M-Y FY Customer ID
0 0 0 0
category_status sku_prefix sku_prefix_2 imputed_category
0 0 0 0
category_final
0
# A tibble: 583,996 × 21
   item_id status    created_at          sku    price qty_ordered category_name_1
     <dbl> <chr>     <dttm>              <chr>  <dbl>       <dbl> <chr>
 1  211133 canceled  2016-07-01 00:00:00 "kcc…    240           1 Beauty & Groom…
 2  211134 canceled  2016-07-01 00:00:00 "Ego…   2450           1 Women's Fashion
 3  211135 complete  2016-07-01 00:00:00 "kcc…    360           1 Beauty & Groom…
 4  211136 order_re… 2016-07-01 00:00:00 "BK7…    555           2 Soghaat
 5  211137 canceled  2016-07-01 00:00:00 "UK_…     80           1 Soghaat
 6  211138 complete  2016-07-01 00:00:00 "kcc…    360           1 Beauty & Groom…
 7  211139 complete  2016-07-01 00:00:00 "UK_…    170           1 Soghaat
 8  211140 canceled  2016-07-01 00:00:00 "App…  96499           1 Mobiles & Tabl…
 9  211141 canceled  2016-07-01 00:00:00 "App…  96499           1 Mobiles & Tabl…
10  211142 complete  2016-07-01 00:00:00 "GFC…   5500           1 Appliances
# ℹ 583,986 more rows
# ℹ 14 more variables: discount_amount <dbl>, payment_method <chr>,
#   `BI Status` <chr>, Year <dbl>, Month <dbl>, `Customer Since` <chr>,
#   `M-Y` <dttm>, FY <chr>, `Customer ID` <dbl>, category_status <chr>,
#   sku_prefix <chr>, sku_prefix_2 <chr>, imputed_category <chr>,
#   category_final <chr>
library(readxl)
library(dplyr)
library(ggplot2)
library(lubridate)
library(scales)
library(tidyr)
library(ggcorrplot)
# Load Excel file
file_path <- "C:\\Users\\A\\Downloads\\Pakistan Largest Ecommerce Dataset Updated.xlsx"
df <- read_excel(file_path, sheet = "Scrubbed Data")
# Change date format
df_raw <- read_excel(file_path, sheet = "Scrubbed Data", col_types = "text")
df_raw$created_at <- as.Date(as.numeric(df_raw$created_at), origin = "1899-12-30")
df <- df_raw %>%
mutate(
`M-Y` = format(created_at, "%Y-%m"),
day_of_week = weekdays(created_at)
)
cat("✅ Dataset Overview:\n")
cat("Total Rows:", nrow(df), "\n")
cat("Total Columns:", ncol(df), "\n")
cat("Unique SKUs:", n_distinct(df$sku), "\n")
cat("Unique Customers:", n_distinct(df$`Customer ID`), "\n")
cat("Date Range:", as.character(min(df$created_at, na.rm = TRUE)),
"to", as.character(max(df$created_at, na.rm = TRUE)), "\n")
best_selling <- df %>%
group_by(category_name_1) %>%
summarise(item_count = n()) %>%
arrange(item_count)
# Plot: Best Selling Categories
ggplot(best_selling, aes(x = item_count, y = reorder(category_name_1, item_count))) +
geom_col(fill = "skyblue") +
geom_text(aes(label = paste0(round(item_count / 1000), "k")), hjust = -0.1) +
labs(
title = "Amount Transaction Product Category",
x = "Transactions",
y = "Category Product"
) +
scale_x_continuous(labels = label_number(scale = 1/1000, suffix = "k")) +
theme_minimal()
# Recalculate day_of_week if not already added
if (!"day_of_week" %in% colnames(df)) {
df$day_of_week <- weekdays(df$created_at)
}
# Count orders by day
daywise_total <- df %>%
filter(!is.na(day_of_week)) %>%
count(day_of_week)
# Plot
ggplot(daywise_total, aes(x = reorder(day_of_week, -n), y = n)) +
geom_col(fill = "coral") +
labs(title = "Orders by Day of the Week", x = "Day", y = "Order Count") +
theme_minimal()
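# The bars above are ordered by volume; a small variant shows calendar order instead
# (assumes English weekday names from weekdays(), which is locale-dependent):
daywise_total$day_of_week <- factor(
  daywise_total$day_of_week,
  levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
)
ggplot(daywise_total, aes(x = day_of_week, y = n)) +
  geom_col(fill = "coral") +
  labs(title = "Orders by Day of the Week (Calendar Order)", x = "Day", y = "Order Count") +
  theme_minimal()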
# Ensure columns are numeric
df$qty_ordered <- as.numeric(df$qty_ordered)
# Extract Year and Month
df <- df %>%
mutate(
Year = year(created_at),
Month = month(created_at)
)
# Group by Year & Month
monthly_by_year <- df %>%
filter(!is.na(created_at)) %>%
group_by(Year, Month) %>%
summarise(total_qty = sum(qty_ordered, na.rm = TRUE), .groups = "drop")
# Line plot: one line per year
ggplot(monthly_by_year, aes(x = Month, y = total_qty, color = factor(Year))) +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
scale_x_continuous(breaks = 1:12) +
labs(title = "Amount Transactions Every Month",
x = "Month",
y = "Transactions",
color = "Year") +
theme_minimal()
# Ensure columns are numeric
df$discount_amount <- as.numeric(df$discount_amount)
df$created_at <- as.POSIXct(df$created_at) # if not already
df$item_id <- as.character(df$sku) # assuming 'sku' is item_id
# Extract Year and Month
df <- df %>%
mutate(Year = year(created_at),
Month = month(created_at))
# Count non-zero discount transactions per Year-Month
discount_df <- df %>%
filter(discount_amount != 0) %>%
group_by(Year, Month) %>%
summarise(count_discount = n(), .groups = "drop")
# Count all transactions per Year-Month
transaction_df <- df %>%
group_by(Year, Month) %>%
summarise(transactions = n(), .groups = "drop")
# Merge discount into transaction summary
merged_df <- left_join(transaction_df, discount_df, by = c("Year", "Month")) %>%
mutate(count_discount = replace_na(count_discount, 0))
# Correlation matrix
correlation_matrix <- merged_df %>%
select(Year, Month, transactions, count_discount) %>%
cor(use = "complete.obs")
# Monthly totals for plotting
monthly_plot_df <- merged_df %>%
group_by(Month) %>%
summarise(Transactions = sum(transactions),
Discount = sum(count_discount), .groups = "drop") %>%
pivot_longer(cols = c(Transactions, Discount),
names_to = "Metric", values_to = "Count")
# Correlation heatmap
ggcorrplot(correlation_matrix,
lab = TRUE,
method = "square",
title = "Heatmap Correlation")
# Line plot of monthly totals
ggplot(monthly_plot_df, aes(x = Month, y = Count, color = Metric, group = Metric)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
scale_x_continuous(breaks = 1:12) +
scale_y_continuous(labels = label_number(scale = 1/1000, suffix = "k")) +
labs(title = "Monthly Transactions & Discounts",
x = "Month", y = "Transaction Count") +
scale_color_manual(values = c("Transactions" = "red", "Discount" = "blue")) +
theme_minimal()
payment_method <- df %>%
group_by(payment_method) %>%
summarise(item_count = n()) %>%
arrange(item_count)
# Visualization
ggplot(payment_method, aes(x = item_count, y = reorder(payment_method, item_count))) +
geom_col(fill = "steelblue") +
geom_text(aes(label = paste0(round(item_count / 1000), "k")), hjust = -0.1) +
labs(
title = "Amount Payment Method Used",
x = "Transaction",
y = "Payment Method"
) +
scale_x_continuous(labels = label_number(scale = 1/1000, suffix = "k")) +
theme_minimal()
✅ Dataset Overview:
Total Rows: 584524
Total Columns: 20
Unique SKUs: 84800
Unique Customers: 115327
Date Range: 2016-07-01 to 2018-08-28
# ----------------------
# Load Required Libraries
# ----------------------
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(caret)
library(randomForest)
library(xgboost)
library(ggplot2)
# ----------------------
# Load and Prepare Data
# ----------------------
file_path <- "C:/Users/A/Downloads/Pakistan Largest Ecommerce Dataset Updated.xlsx"
df <- read_excel(file_path, sheet = "Scrubbed Data")
# Rename relevant columns (match Excel file exactly)
df <- df %>%
rename(
customer_id = `Customer ID`,
category_final = `category_name_1`
)
# Convert necessary columns
df$customer_id <- as.character(df$customer_id)
df$created_at <- as.Date(df$created_at)
# ----------------------
# Customer Summary
# ----------------------
customer_summary <- df %>%
filter(status %in% c("complete", "paid", "received")) %>%
group_by(customer_id) %>%
summarise(
total_orders = n(),
total_quantity = sum(qty_ordered, na.rm = TRUE),
total_spent = sum(price * qty_ordered, na.rm = TRUE),
avg_order_value = total_spent / total_orders,
unique_categories = n_distinct(category_final),
most_frequent_category = {
valid_cats <- category_final[!is.na(category_final)]
if (length(valid_cats) == 0) NA_character_ else names(which.max(table(valid_cats)))
}
)
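# Before cutting into terciles, a quick look at the spend distribution (total_spent
# is typically heavily right-skewed, which is why quantile-based cut points are used):
summary(customer_summary$total_spent)
quantile(customer_summary$total_spent, probs = c(0.33, 0.66, 0.90, 0.99), na.rm = TRUE)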
# ----------------------
# Segment Based on Spending
# ----------------------
p33 <- quantile(customer_summary$total_spent, 0.33, na.rm = TRUE)
p66 <- quantile(customer_summary$total_spent, 0.66, na.rm = TRUE)
customer_summary <- customer_summary %>%
mutate(
segment = case_when(
total_spent <= p33 ~ "Low",
total_spent <= p66 ~ "Medium",
TRUE ~ "High"
)
)
# View count of each segment
print(table(customer_summary$segment))
# Segment-wise summary stats
segment_stats <- customer_summary %>%
group_by(segment) %>%
summarise(
n_customers = n(),
avg_spent = mean(total_spent),
avg_order_value = mean(avg_order_value),
avg_quantity = mean(total_quantity)
)
print(segment_stats)
# ----------------------
# First 3 Months Analysis
# ----------------------
first_order_dates <- df %>%
group_by(customer_id) %>%
summarise(first_order = min(created_at))
df <- df %>%
left_join(first_order_dates, by = "customer_id") %>%
mutate(
months_since_first = interval(first_order, created_at) %/% months(1)
)
early_data <- df %>%
filter(months_since_first <= 3 & status %in% c("complete", "paid", "received"))
early_summary <- early_data %>%
group_by(customer_id) %>%
summarise(
total_orders = n(),
total_quantity = sum(qty_ordered, na.rm = TRUE),
total_spent = sum(price * qty_ordered, na.rm = TRUE),
avg_order_value = total_spent / total_orders,
unique_categories = n_distinct(category_final)
)
# ----------------------
# Prepare Model Data
# ----------------------
model_data <- early_summary %>%
left_join(customer_summary %>% select(customer_id, segment), by = "customer_id") %>%
filter(!is.na(segment))
model_data$segment <- as.factor(model_data$segment)
model_data <- model_data %>% select(-customer_id)
# ----------------------
# Split Data
# ----------------------
set.seed(123)
train_index <- createDataPartition(model_data$segment, p = 0.8, list = FALSE)
train <- model_data[train_index, ]
test <- model_data[-train_index, ]
# ----------------------
# Train XGBoost Classifier
# ----------------------
grid <- expand.grid(
nrounds = 20,
max_depth = 2,
eta = 0.3,
gamma = 0,
colsample_bytree = 0.8,
min_child_weight = 3,
subsample = 0.8
)
control <- trainControl(method = "cv", number = 3)
system.time({
model <- train(
segment ~ .,
data = train,
method = "xgbTree",
trControl = control,
tuneGrid = grid
)
})
# ----------------------
# Evaluate Model
# ----------------------
pred <- predict(model, newdata = test)
conf_matrix <- confusionMatrix(pred, test$segment)
print(conf_matrix)
# Variable Importance
print(varImp(model))
  High    Low Medium
 27088  26301  26282
# A tibble: 3 × 5
  segment n_customers avg_spent avg_order_value avg_quantity
  <chr>         <int>     <dbl>           <dbl>        <dbl>
1 High          27088    47203.          13468.         9.90
2 Low           26301      705.            585.         1.50
3 Medium        26282     2740.           1602.         3.18
user system elapsed
2.75 0.99 1.71
Confusion Matrix and Statistics
Reference
Prediction High Low Medium
High 4821 0 2
Low 113 5182 245
Medium 358 0 4908
Overall Statistics
Accuracy : 0.9541
95% CI : (0.9507, 0.9573)
No Information Rate : 0.3386
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9311
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: High Class: Low Class: Medium
Sensitivity 0.9110 1.0000 0.9521
Specificity 0.9998 0.9657 0.9658
Pos Pred Value 0.9996 0.9354 0.9320
Neg Pred Value 0.9564 1.0000 0.9762
Prevalence 0.3386 0.3316 0.3298
Detection Rate 0.3085 0.3316 0.3140
Detection Prevalence 0.3086 0.3545 0.3369
Balanced Accuracy 0.9554 0.9829 0.9590
xgbTree variable importance
Overall
total_spent 100.0000
avg_order_value 12.8109
total_quantity 0.9547
total_orders 0.8830
unique_categories 0.0000
# ----------------------
# Libraries for Visuals and Classification
# ----------------------
library(ggplot2)
library(tidyr)
library(dplyr)
library(caret)
library(randomForest)
library(lubridate)
# ----------------------
# Classification Metrics Plot (Customer)
# ----------------------
class_metrics <- data.frame(
Class = c("High", "Low", "Medium"),
Sensitivity = c(0.9696, 1.0000, 0.9587),
Specificity = c(1.0000, 0.9721, 0.9929),
Precision = c(1.0000, 0.9664, 0.9844),
Balanced_Accuracy = c(0.9848, 0.9860, 0.9758)
)
class_metrics_long <- class_metrics %>%
pivot_longer(cols = -Class, names_to = "Metric", values_to = "Value")
ggplot(class_metrics_long, aes(x = Class, y = Value, fill = Metric)) +
geom_col(position = position_dodge()) +
geom_text(aes(label = sprintf("%.2f", Value)),
position = position_dodge(width = 0.9), vjust = -0.3, size = 3) +
scale_fill_brewer(palette = "Set2") +
ylim(0, 1.1) +
labs(title = "Classification Metrics by Segment",
y = "Score",
x = "Customer Segment",
fill = "Metric") +
theme_minimal(base_size = 13)
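# The metric values above are hard-coded; they can also be pulled directly from the
# fitted model's confusionMatrix object (conf_matrix from the previous section), which
# avoids transcription drift between runs. A minimal sketch:
metrics_from_cm <- as.data.frame(
  conf_matrix$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value", "Balanced Accuracy")]
)
metrics_from_cm$Class <- sub("Class: ", "", rownames(metrics_from_cm))
print(metrics_from_cm)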
# ----------------------
# Confusion Matrix Plot (Customer)
# ----------------------
# Assumes `pred` and `test$segment` already exist from previous model
cm <- confusionMatrix(pred, test$segment)
cm_df <- as.data.frame(cm$table)
colnames(cm_df) <- c("Actual", "Predicted", "Freq")
ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), vjust = 0.5, size = 5) +
scale_fill_gradient(low = "lightblue", high = "steelblue") +
labs(title = "Confusion Matrix (Customer Segment Prediction)",
x = "Actual Segment", y = "Predicted Segment") +
theme_minimal(base_size = 13)
# ----------------------
# SKU SEGMENTATION CLASSIFICATION
# ----------------------
# Parse created_at safely
df$created_at <- parse_date_time(df$created_at, orders = c("ymd", "dmy", "mdy", "Ymd HMS", "ymd HMS"))
df <- df %>% filter(!is.na(created_at)) # remove rows with invalid dates
# SKU-level summary
item_summary <- df %>%
filter(status %in% c("complete", "paid", "received")) %>%
group_by(sku) %>%
summarise(
total_orders = n(),
total_quantity = sum(qty_ordered, na.rm = TRUE),
total_revenue = sum(price * qty_ordered, na.rm = TRUE),
avg_price = mean(price, na.rm = TRUE),
unique_buyers = n_distinct(customer_id),
num_days_sold = n_distinct(as.Date(created_at)),
category = first(na.omit(category_final)),
.groups = "drop"
)
# Revenue-based segmentation
p33 <- quantile(item_summary$total_revenue, 0.33, na.rm = TRUE)
p66 <- quantile(item_summary$total_revenue, 0.66, na.rm = TRUE)
item_summary <- item_summary %>%
mutate(
segment = case_when(
total_revenue <= p33 ~ "Low",
total_revenue <= p66 ~ "Medium",
TRUE ~ "High"
),
segment = factor(segment, levels = c("Low", "Medium", "High"))
)
print(table(item_summary$segment))
# Prepare modeling data
model_item_data <- item_summary %>%
select(-sku, -category)
# Train/test split
set.seed(123)
train_index <- createDataPartition(model_item_data$segment, p = 0.8, list = FALSE)
train_item <- model_item_data[train_index, ]
test_item <- model_item_data[-train_index, ]
# Train Random Forest
item_model <- randomForest(segment ~ ., data = train_item, ntree = 100)
# Predict & evaluate
pred_item <- predict(item_model, newdata = test_item)
conf_matrix_item <- confusionMatrix(pred_item, test_item$segment)
print(conf_matrix_item)
# Feature importance plot
varImpPlot(item_model, main = "Feature Importance - Item Segmentation")
# Bar plot of item segments
ggplot(item_summary, aes(x = segment, fill = segment)) +
geom_bar() +
labs(title = "Item Segment Distribution",
x = "Item Segment", y = "Number of Items") +
scale_fill_brewer(palette = "Set2") +
theme_minimal()
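# Note: total_revenue is both a predictor and the variable the segments were cut from,
# so the perfect accuracy reported below is expected (target leakage). A leakage-free
# variant, as a sketch, drops total_revenue from the predictors:
model_item_data_noleak <- item_summary %>%
  select(-sku, -category, -total_revenue)
set.seed(123)
idx_noleak <- createDataPartition(model_item_data_noleak$segment, p = 0.8, list = FALSE)
item_model_noleak <- randomForest(segment ~ ., data = model_item_data_noleak[idx_noleak, ], ntree = 100)
pred_noleak <- predict(item_model_noleak, newdata = model_item_data_noleak[-idx_noleak, ])
print(confusionMatrix(pred_noleak, model_item_data_noleak[-idx_noleak, ]$segment))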
Low Medium High
20851 20858 21351
Confusion Matrix and Statistics
Reference
Prediction Low Medium High
Low 4170 0 0
Medium 0 4171 0
High 0 0 4270
Overall Statistics
Accuracy : 1
95% CI : (0.9997, 1)
No Information Rate : 0.3386
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 1
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Low Class: Medium Class: High
Sensitivity 1.0000 1.0000 1.0000
Specificity 1.0000 1.0000 1.0000
Pos Pred Value 1.0000 1.0000 1.0000
Neg Pred Value 1.0000 1.0000 1.0000
Prevalence 0.3307 0.3307 0.3386
Detection Rate 0.3307 0.3307 0.3386
Detection Prevalence 0.3307 0.3307 0.3386
Balanced Accuracy 1.0000 1.0000 1.0000
# ------------------------
# Load Required Libraries
# ------------------------
library(readxl)
library(dplyr)
library(lubridate)
library(tidyr)
library(rpart)
library(rpart.plot)
library(tidyverse) # includes ggplot2, dplyr, readr, etc.
# Convert date column
df$created_at <- as.Date(df$created_at, format = "%m/%d/%Y")
head(df$created_at)
# ------------------------
# Feature Engineering
# ------------------------
df <- df %>%
mutate(
month = month(created_at),
year = year(created_at),
month_year = floor_date(created_at, "month"),
is_sale_month = ifelse(month %in% c(11, 12), 1, 0), # Nov/Dec sale period
is_cod = ifelse(payment_method == "cod", 1, 0)
)
# ------------------------
# Top 3 Categories by Order Volume
# ------------------------
top_categories <- df %>%
count(category_final, sort = TRUE) %>%
top_n(3, n) %>%
pull(category_final)
df_top <- df %>% filter(category_final %in% top_categories)
# ------------------------
# Aggregate Monthly Data
# ------------------------
monthly_data <- df_top %>%
group_by(category_final, month_year, month, year) %>%
summarise(
order_volume = n(),
sale_month = first(is_sale_month),
cod_ratio = mean(is_cod, na.rm = TRUE),
.groups = 'drop'
)
# ------------------------
# Split Data into Train/Test
# ------------------------
set.seed(42)
train_index <- 1:round(0.7 * nrow(monthly_data))
train <- monthly_data[train_index, ]
test <- monthly_data[-train_index, ]
# ------------------------
# Decision Tree Model (Raw)
# ------------------------
tree_model <- rpart(
order_volume ~ month + year + sale_month + cod_ratio + category_final,
data = train,
method = "anova",
control = rpart.control(cp = 0.01)
)
pred <- predict(tree_model, newdata = test)
actual <- test$order_volume
# MAPE (manual)
mape <- mean(abs((actual - pred) / actual)) * 100
rmse <- sqrt(mean((actual - pred)^2))
cat("MAPE:", round(mape, 2), "%\n")
cat("RMSE:", round(rmse, 2), "\n")
# ------------------------
# Clean Numeric Columns (optional, safety)
# ------------------------
train$order_volume <- as.numeric(train$order_volume)
test$order_volume <- as.numeric(test$order_volume)
# ------------------------
# Log Transform Target
# ------------------------
train$log_order_volume <- log1p(train$order_volume)
test$log_order_volume <- log1p(test$order_volume)
# ------------------------
# Evaluation Function
# ------------------------
evaluate_model <- function(actual, predicted, label) {
valid <- which(!is.na(actual) & !is.na(predicted) & is.finite(actual) & actual != 0)
if (length(valid) == 0) {
cat("⚠️", label, "- No valid data to evaluate\n\n")
return()
}
actual_clean <- actual[valid]
predicted_clean <- predicted[valid]
mape <- mean(abs((actual_clean - predicted_clean) / actual_clean)) * 100
rmse <- sqrt(mean((actual_clean - predicted_clean)^2))
cat(paste0("Model: ", label, "\n"))
cat("MAPE: ", round(mape, 2), "%\n")
cat("RMSE:", round(rmse, 2), "\n\n")
}
# ------------------------
# Decision Tree (Log Target)
# ------------------------
dt_model <- rpart(
log_order_volume ~ month + year + sale_month + cod_ratio,
data = train,
method = "anova"
)
dt_pred_log <- predict(dt_model, newdata = test)
dt_pred <- expm1(dt_pred_log)
evaluate_model(test$order_volume, dt_pred, "Decision Tree")
# ------------------------
# Linear Regression (Log Target)
# ------------------------
lm_model <- lm(
log_order_volume ~ month + year + sale_month + cod_ratio,
data = train
)
lm_pred_log <- predict(lm_model, newdata = test)
lm_pred <- expm1(lm_pred_log)
evaluate_model(test$order_volume, lm_pred, "Linear Regression")
# ------------------------
# Plot Tree
# ------------------------
rpart.plot(dt_model, main = "Regression Tree for Order Volume")
MAPE: 107.06 %
RMSE: 2648.97
Model: Decision Tree
MAPE: 76.53 %
RMSE: 1775.14
Model: Linear Regression
MAPE: 62.07 %
RMSE: 1409.72
# Ensure required libraries
library(dplyr)
library(lubridate)
library(caret)
# Re-create key columns in clean_data BEFORE split
clean_data <- clean_data %>%
mutate(
created_at = as.Date(created_at),
order_volume = as.numeric(qty_ordered),
log_order_volume = log1p(order_volume),
year = year(created_at),
month = month(created_at, label = TRUE),
sale_month = ifelse(month %in% c("Nov", "Dec"), 1, 0),
cod_ratio = ifelse(tolower(payment_method) == "cod", 1, 0)  # case-insensitive COD flag
)
# ✅ Check that sale_month now exists
stopifnot("sale_month" %in% colnames(clean_data))
# Train-Test Split
set.seed(123)
split <- createDataPartition(clean_data$log_order_volume, p = 0.8, list = FALSE)
train <- clean_data[split, ]
test <- clean_data[-split, ]
# ✅ Confirm sale_month still exists after split
stopifnot("sale_month" %in% colnames(train))
stopifnot("sale_month" %in% colnames(test))
# -----------------------
# Load Required Libraries
# -----------------------
library(dplyr)
library(rpart)
library(rpart.plot)
library(randomForest)
library(xgboost)
# -----------------------
# Decision Tree (Tuned)
# -----------------------
control_params <- rpart.control(
cp = 0.001,
maxdepth = 10,
minsplit = 20,
minbucket = 5
)
dt_model_tuned <- rpart(
log_order_volume ~ month + year + sale_month + cod_ratio,
data = train,
method = "anova",
control = control_params
)
# Predict & Evaluate
dt_pred_log <- predict(dt_model_tuned, newdata = test)
dt_pred <- expm1(dt_pred_log)
actual <- test$order_volume
pred <- dt_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / ifelse(actual == 0, 1, actual))) * 100
cat("Tuned Decision Tree RMSE:", round(rmse, 2), "\n")
cat("Tuned Decision Tree MAPE:", round(mape, 2), "%\n")
# -----------------------
# Decision Tree (Pruned)
# -----------------------
best_cp <- dt_model_tuned$cptable[which.min(dt_model_tuned$cptable[,"xerror"]), "CP"]
dt_model_pruned <- prune(dt_model_tuned, cp = best_cp)
dt_pruned_pred_log <- predict(dt_model_pruned, newdata = test)
dt_pruned_pred <- expm1(dt_pruned_pred_log)
actual <- test$order_volume
pred <- dt_pruned_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / ifelse(actual == 0, 1, actual))) * 100
cat("Pruned Tree RMSE:", round(rmse, 2), "\n")
cat("Pruned Tree MAPE:", round(mape, 2), "%\n")
# -----------------------
# Random Forest
# -----------------------
rf_model <- randomForest(
log_order_volume ~ month + year + sale_month + cod_ratio,
data = train,
ntree = 500,
mtry = 2,
importance = TRUE
)
rf_pred_log <- predict(rf_model, newdata = test)
rf_pred <- expm1(rf_pred_log)
actual <- test$order_volume
pred <- rf_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / ifelse(actual == 0, 1, actual))) * 100
cat("Random Forest RMSE:", round(rmse, 2), "\n")
cat("Random Forest MAPE:", round(mape, 2), "%\n")
# -----------------------
# Decision Tree (with category_final)
# -----------------------
# Make sure category_final exists (e.g., from renamed 'category_name_1')
if ("category_final" %in% names(train)) {
dt_model_cat <- rpart(
log_order_volume ~ category_final + month + year + sale_month + cod_ratio,
data = train,
method = "anova"
)
dt_cat_pred_log <- predict(dt_model_cat, newdata = test)
dt_cat_pred <- expm1(dt_cat_pred_log)
actual <- test$order_volume
pred <- dt_cat_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / ifelse(actual == 0, 1, actual))) * 100
cat("Tree w/ Category RMSE:", round(rmse, 2), "\n")
cat("Tree w/ Category MAPE:", round(mape, 2), "%\n")
}
# -----------------------
# XGBoost
# -----------------------
# Ensure 'created_at' is in Date format and create required features
clean_data <- clean_data %>%
mutate(
created_at = as.Date(created_at),
order_volume = as.numeric(qty_ordered),
log_order_volume = log1p(order_volume),
year = year(created_at),
month = month(created_at, label = TRUE),
sale_month = ifelse(month %in% c("Nov", "Dec"), 1, 0), # sale_month created here
cod_ratio = ifelse(tolower(payment_method) == "cod", 1, 0)  # case-insensitive COD flag
)
# --- Train-Test Split ---
set.seed(123)
split <- createDataPartition(clean_data$log_order_volume, p = 0.8, list = FALSE)
train <- clean_data[split, ]
test <- clean_data[-split, ]
# --- Ensure 'sale_month' column exists in train and test (safety check) ---
if (!"sale_month" %in% colnames(train)) {
train$sale_month <- ifelse(train$month %in% c("Nov", "Dec", 11, 12), 1, 0)
}
if (!"sale_month" %in% colnames(test)) {
test$sale_month <- ifelse(test$month %in% c("Nov", "Dec", 11, 12), 1, 0)
}
# --- Convert month to numeric if needed ---
train$month <- as.numeric(month(train$created_at))
test$month <- as.numeric(month(test$created_at))
# --- Define predictors ---
predictors <- c("month", "year", "sale_month", "cod_ratio")
# --- Prepare data matrices ---
train_matrix <- xgb.DMatrix(data = as.matrix(train[, predictors]), label = train$log_order_volume)
test_matrix <- xgb.DMatrix(data = as.matrix(test[, predictors]))
# --- Train XGBoost Model ---
xgb_model <- xgboost(
data = train_matrix,
nrounds = 100,
objective = "reg:squarederror",
verbose = 0
)
# --- Predict and Evaluate ---
xgb_pred_log <- predict(xgb_model, newdata = test_matrix)
xgb_pred <- expm1(xgb_pred_log)
# Evaluation metrics
actual <- test$order_volume
pred <- xgb_pred
rmse <- sqrt(mean((actual - pred)^2, na.rm = TRUE))
mape <- mean(abs((actual - pred) / actual), na.rm = TRUE) * 100
cat("XGBoost RMSE:", round(rmse, 2), "\n")
cat("XGBoost MAPE:", round(mape, 2), "%\n")
Tuned Decision Tree RMSE: 4.69
Tuned Decision Tree MAPE: 20.7 %
Pruned Tree RMSE: 4.69
Pruned Tree MAPE: 20.7 %
Random Forest RMSE: 4.7
Random Forest MAPE: 21.06 %
Tree w/ Category RMSE: 4.68
Tree w/ Category MAPE: 17.26 %
XGBoost RMSE: 4.69
XGBoost MAPE: 20.57 %
# Load libraries
library(xgboost)
library(Matrix)
library(dplyr)
# -------- PREPARE DATA --------
# Ensure predictors exist
predictors <- c("month", "year", "cod_ratio", "sale_month") # or adapt based on your data
train_x <- train[, predictors]
test_x <- test[, predictors]
# Convert to matrix
dtrain <- xgb.DMatrix(data = as.matrix(train_x), label = train$log_order_volume)
dtest <- xgb.DMatrix(data = as.matrix(test_x), label = test$log_order_volume)
# -------- TRAINING PARAMETERS --------
params <- list(
objective = "reg:squarederror",
eval_metric = "rmse",
max_depth = 6,
eta = 0.1, # learning rate
subsample = 0.8,
colsample_bytree = 0.8
)
# -------- TRAIN MODEL --------
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 300,
early_stopping_rounds = 10,
watchlist = list(train = dtrain),
verbose = 0
)
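# With only the training set in the watchlist, early stopping monitors training RMSE.
# A variant (sketch) that stops on held-out RMSE instead; dtest was built above with
# labels, so it can serve as the evaluation set:
xgb_model_heldout <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 300,
  early_stopping_rounds = 10,
  watchlist = list(train = dtrain, eval = dtest),
  verbose = 0
)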
# -------- PREDICT --------
xgb_pred_log <- predict(xgb_model, newdata = dtest)
xgb_pred <- expm1(xgb_pred_log) # inverse log1p
# -------- MANUAL EVALUATION --------
actual <- test$order_volume
pred <- xgb_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / actual)) * 100
cat("✅ XGBoost RMSE:", round(rmse, 2), "\n")
cat("✅ XGBoost MAPE:", round(mape, 2), "%\n")
# Make sure the predictors are present
predictors <- c("month", "year", "cod_ratio", "sale_month") # Adjust as needed
train_x <- train[, predictors]
train_y <- train$log_order_volume
# Convert to matrix
dtrain <- xgb.DMatrix(data = as.matrix(train_x), label = train_y)
params <- list(
objective = "reg:squarederror",
eta = 0.1, # Learning rate (can try 0.01, 0.05 later)
max_depth = 6, # Try 4–10
subsample = 0.8, # Row sampling
colsample_bytree = 0.8, # Feature sampling
eval_metric = "rmse"
)
# Cross-validation
set.seed(123)
xgb_cv <- xgb.cv(
params = params,
data = dtrain,
nrounds = 500,
nfold = 5,
early_stopping_rounds = 10,
print_every_n = 10,
verbose = 1
)
best_nrounds <- xgb_cv$best_iteration
cat("Best nrounds:", best_nrounds, "\n")
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = best_nrounds
)
# Prepare test set
test_x <- test[, predictors]
dtest <- xgb.DMatrix(data = as.matrix(test_x))
# Predict
xgb_pred_log <- predict(xgb_model, newdata = dtest)
xgb_pred <- expm1(xgb_pred_log) # Inverse of log1p
# Manual Evaluation
actual <- test$order_volume
pred <- xgb_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / actual)) * 100
cat("✅ Tuned XGBoost RMSE:", round(rmse, 2), "\n")
cat("✅ Tuned XGBoost MAPE:", round(mape, 2), "%\n")
colnames(train)
library(rpart.plot)
# Visualize the tree
rpart.plot(dt_model,
type = 2, # Label all nodes
extra = 101, # Show fitted value and % of observations
fallen.leaves = TRUE,
cex = 0.7, # Font size
main = "Decision Tree: Predicting Log Order Volume")
library(dplyr)
library(lubridate)
# Ensure created_at is Date type
df$created_at <- as.Date(df$created_at)
# Aggregate by date
daily_data <- df %>%
group_by(created_at) %>%
summarise(
order_volume = n(), # Or sum(qty_ordered) if appropriate
cod_ratio = mean(tolower(payment_method) == "cod", na.rm = TRUE),  # share of COD orders
.groups = "drop"
) %>%
mutate(
day = day(created_at),
week = week(created_at),
month = month(created_at),
year = year(created_at),
sale_day = ifelse(month(created_at) %in% c(11, 12), 1, 0), # Adjust for sale logic
log_order_volume = log1p(order_volume)
)
weekly_data <- df %>%
mutate(week_start = floor_date(as.Date(created_at), "week")) %>%
group_by(week_start) %>%
summarise(
order_volume = n(),
cod_ratio = mean(tolower(payment_method) == "cod", na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
week = week(week_start),
month = month(week_start),
year = year(week_start),
sale_week = ifelse(month %in% c(11, 12), 1, 0),
log_order_volume = log1p(order_volume)
)
# Daily data
n_daily <- nrow(daily_data)
split_daily <- floor(0.8 * n_daily)
train_daily <- daily_data[1:split_daily, ]
test_daily <- daily_data[(split_daily + 1):n_daily, ]
# Weekly data
n_weekly <- nrow(weekly_data)
split_weekly <- floor(0.8 * n_weekly)
train_weekly <- weekly_data[1:split_weekly, ]
test_weekly <- weekly_data[(split_weekly + 1):n_weekly, ]
library(rpart)
dt_daily <- rpart(log_order_volume ~ day + month + year + sale_day + cod_ratio,
data = train_daily, method = "anova")
pred_log_daily <- predict(dt_daily, newdata = test_daily)
pred_daily <- expm1(pred_log_daily)
rmse_daily <- sqrt(mean((test_daily$order_volume - pred_daily)^2))
mape_daily <- mean(abs(test_daily$order_volume - pred_daily) / test_daily$order_volume) * 100
cat("Daily RMSE:", round(rmse_daily, 2), "\n")
cat("Daily MAPE:", round(mape_daily, 2), "%\n")
dt_weekly <- rpart(log_order_volume ~ week + month + year + sale_week + cod_ratio,
data = train_weekly, method = "anova")
pred_log_weekly <- predict(dt_weekly, newdata = test_weekly)
pred_weekly <- expm1(pred_log_weekly)
rmse_weekly <- sqrt(mean((test_weekly$order_volume - pred_weekly)^2))
mape_weekly <- mean(abs(test_weekly$order_volume - pred_weekly) / test_weekly$order_volume) * 100
cat("Weekly RMSE:", round(rmse_weekly, 2), "\n")
cat("Weekly MAPE:", round(mape_weekly, 2), "%\n")
library(rpart.plot)
# Visualize Daily Tree
rpart.plot(dt_daily, main = "Daily-Level Decision Tree")
# Visualize Weekly Tree
rpart.plot(dt_weekly, main = "Weekly-Level Decision Tree")
sum(is.na(train_daily))
sum(is.na(test_daily))
train_daily <- na.omit(train_daily)
test_daily <- na.omit(test_daily)
library(randomForest)
rf_daily <- randomForest(
log_order_volume ~ day + month + year + sale_day + cod_ratio,
data = train_daily,
ntree = 100
)
pred_rf_log <- predict(rf_daily, newdata = test_daily)
pred_rf <- expm1(pred_rf_log)
actual <- test_daily$order_volume
# Handle MAPE safely
mape_rf <- mean(abs(actual - pred_rf) / ifelse(actual == 0, 1, actual)) * 100
rmse_rf <- sqrt(mean((actual - pred_rf)^2))
cat("Random Forest Daily RMSE:", round(rmse_rf, 2), "\n")
cat("Random Forest Daily MAPE:", round(mape_rf, 2), "%\n")
library(xgboost)
library(dplyr)
# Assume 'daily_data' is your data frame
daily_data <- daily_data %>%
select(order_volume, log_order_volume, cod_ratio, day, week, month, year, sale_day) %>%
na.omit()
# Split into train and test
set.seed(123)
train_index <- sample(nrow(daily_data), 0.8 * nrow(daily_data))
train <- daily_data[train_index, ]
test <- daily_data[-train_index, ]
# Prepare matrix for XGBoost
train_matrix <- as.matrix(train %>% select(-order_volume, -log_order_volume))
test_matrix <- as.matrix(test %>% select(-order_volume, -log_order_volume))
dtrain <- xgb.DMatrix(data = train_matrix, label = train$log_order_volume)
dtest <- xgb.DMatrix(data = test_matrix, label = test$log_order_volume)
# Train XGBoost
xgb_model <- xgboost(
data = dtrain,
objective = "reg:squarederror",
nrounds = 100,
eta = 0.1,
max_depth = 6,
verbose = 0
)
pred_log <- predict(xgb_model, dtest)
pred <- expm1(pred_log) # convert back from log scale
importance <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance)
predictors <- c("day", "week", "month", "year", "sale_day", "cod_ratio")
predictors <- c("week", "month", "year", "sale_week", "cod_ratio") #For weekly data:
names(daily_data)
names(weekly_data)
set.seed(123)
n <- nrow(daily_data)
train_index <- 1:round(0.8 * n)
train <- daily_data[train_index, ]
test <- daily_data[-train_index, ]
predictors <- c("cod_ratio", "day", "week", "month", "year", "sale_day")
library(xgboost)
dtrain <- xgb.DMatrix(data = as.matrix(train[, predictors]), label = train$log_order_volume)
dtest <- xgb.DMatrix(data = as.matrix(test[, predictors]))
xgb_model <- xgboost(
data = dtrain,
objective = "reg:squarederror",
nrounds = 100,
verbose = 0
)
pred_log <- predict(xgb_model, dtest)
pred <- expm1(pred_log)
actual <- test$order_volume
# Align lengths
valid <- !is.na(actual) & !is.na(pred)
actual_valid <- actual[valid]
pred_valid <- pred[valid]
rmse <- sqrt(mean((actual_valid - pred_valid)^2))
mape <- mean(abs((actual_valid - pred_valid) / actual_valid)) * 100
cat("📈 XGBoost Daily RMSE:", round(rmse, 2), "\n")
cat("📉 XGBoost Daily MAPE:", round(mape, 2), "%\n")
set.seed(123)
n <- nrow(weekly_data)
train_index <- 1:round(0.8 * n)
train <- weekly_data[train_index, ]
test <- weekly_data[-train_index, ]
predictors <- c("cod_ratio", "week", "month", "year", "sale_week")
library(xgboost)
dtrain <- xgb.DMatrix(data = as.matrix(train[, predictors]), label = train$log_order_volume)
dtest <- xgb.DMatrix(data = as.matrix(test[, predictors]))
xgb_model <- xgboost(
data = dtrain,
objective = "reg:squarederror",
nrounds = 100,
verbose = 0
)
# Predict on test set (log scale)
pred_log <- predict(xgb_model, dtest)
# Transform back to original scale
pred <- expm1(pred_log)
actual <- test$order_volume
# Clean NAs and align lengths
valid <- !is.na(actual) & !is.na(pred)
actual_valid <- actual[valid]
pred_valid <- pred[valid]
# Manual RMSE
rmse <- sqrt(mean((actual_valid - pred_valid)^2))
# Manual MAPE
mape <- mean(abs((actual_valid - pred_valid) / actual_valid)) * 100
# Print results
cat("📈 XGBoost Weekly RMSE:", round(rmse, 2), "\n")
cat("📉 XGBoost Weekly MAPE:", round(mape, 2), "%\n")
✅ XGBoost RMSE: 4.69
✅ XGBoost MAPE: 20.56 %
[1] train-rmse:0.348996+0.000435 test-rmse:0.348994+0.001425
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.
[11] train-rmse:0.251811+0.000783 test-rmse:0.251826+0.001620
[21] train-rmse:0.236500+0.000576 test-rmse:0.236503+0.001683
[31] train-rmse:0.234372+0.000530 test-rmse:0.234381+0.001754
[41] train-rmse:0.234028+0.000495 test-rmse:0.234037+0.001801
[51] train-rmse:0.233951+0.000473 test-rmse:0.233959+0.001830
[61] train-rmse:0.233930+0.000465 test-rmse:0.233938+0.001841
[71] train-rmse:0.233925+0.000463 test-rmse:0.233933+0.001845
[81] train-rmse:0.233923+0.000463 test-rmse:0.233931+0.001847
[91] train-rmse:0.233922+0.000463 test-rmse:0.233930+0.001847
[101] train-rmse:0.233922+0.000463 test-rmse:0.233929+0.001847
[111] train-rmse:0.233921+0.000463 test-rmse:0.233929+0.001846
[121] train-rmse:0.233921+0.000463 test-rmse:0.233929+0.001847
[131] train-rmse:0.233921+0.000463 test-rmse:0.233929+0.001847
Stopping. Best iteration:
[126] train-rmse:0.233921+0.000463 test-rmse:0.233929+0.001847
Best nrounds: 126
✅ Tuned XGBoost RMSE: 4.69
✅ Tuned XGBoost MAPE: 20.57 %
Daily RMSE: 765.47
Daily MAPE: 129.33 %
Weekly RMSE: 3483.02
Weekly MAPE: 99.53 %
0
0
Random Forest Daily RMSE: 731.77
Random Forest Daily MAPE: 60.78 %
📈 XGBoost Daily RMSE: 840.97
📉 XGBoost Daily MAPE: 97.58 %
📈 XGBoost Weekly RMSE: 3974.95
📉 XGBoost Weekly MAPE: 98.02 %
# Conclusion
# By using classification models to identify which customers and SKUs have high potential, businesses can focus their efforts on the higher-potential segments, making marketing spend more efficient and improving return on investment (ROI).
# By using regression models for forecasting, businesses can set sales targets accordingly and use relevant levers such as discounts to boost sales.