# Load necessary packages
library(readxl)
library(dplyr)
library(lubridate)
library(stringr)

# Read the Excel file
df <- read_excel("C:/Users/A/Downloads/Pakistan Largest Ecommerce Dataset Updated.xlsx")

# Assign df to raw_data
raw_data <- df

# Remove completely blank rows based on item_id
raw_data <- raw_data %>%
  filter(!is.na(item_id))

cat("Row count after removing blanks:", nrow(raw_data), "\n")

# Remove redundant columns
raw_data <- raw_data %>%
  select(-grand_total, -sales_commission_code, -`Working Date`)

# Count invalid values
invalid_values <- c("#REF!", "\\N", "", "#N/A")

count_invalids <- function(df) {
  sapply(df, function(col) {
    sum(
      is.na(col) | trimws(as.character(col)) %in% invalid_values
    )
  })
}

invalid_counts <- count_invalids(raw_data)
cat("Invalid row counts per column:\n")
print(invalid_counts)

# Remove invalid rows (excluding category_name_1)
columns_to_check <- setdiff(names(raw_data), "category_name_1")
invalid_row_mask <- apply(raw_data[columns_to_check], 1, function(row) {
  any(
    is.na(row) | trimws(as.character(row)) %in% invalid_values
  )
})
clean_data <- raw_data[!invalid_row_mask, ]
cat("Row count after removing invalid rows:", nrow(clean_data), "\n")
print(count(clean_data))

# Tag rows for category imputation
clean_data <- clean_data %>%
  mutate(
    category_status = ifelse(
      trimws(as.character(category_name_1)) %in% c("", "\\N"),
      "Need to impute",
      "Have category"
    )
  )

# SKU cleaning
clean_data$sku <- trimws(clean_data$sku)

clean_data <- clean_data %>%
  mutate(
    sku_prefix = sapply(sku, function(x) {
      x <- as.character(x)
      if (is.na(x)) return(NA)
      parts <- unlist(strsplit(x, "[-_ ]"))
      if (length(parts) == 0) return(NA)
      if (nchar(parts[1]) == 1 && length(parts) > 1) {
        return(paste0(parts[1], "_", parts[2]))
      } else {
        return(parts[1])
      }
    })
  )

# Create sku_prefix_2
clean_data <- clean_data %>%
  mutate(
    sku_prefix_2 = ifelse(
      nchar(sku_prefix) >= 16 & nchar(sku_prefix) <= 19,
      substr(sku_prefix, 1, 3),
      tolower(sku_prefix)
    )
  )

# Create mapping table for imputation
prefix_category_table <- clean_data %>%
  filter(category_status == "Have category") %>%
  group_by(sku_prefix_2, category_name_1) %>%
  summarise(distinct_sku_count = n_distinct(sku), .groups = "drop")

final_prefix_category_table <- prefix_category_table %>%
  group_by(sku_prefix_2) %>%
  slice_max(order_by = distinct_sku_count, n = 1, with_ties = FALSE) %>%
  ungroup()
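
# Quick coverage check (a sketch, not part of the original pipeline): how many
# rows tagged "Need to impute" have a prefix that appears in the mapping table
# and can therefore be filled by the join below.
to_impute <- clean_data %>% filter(category_status == "Need to impute")
cat("Rows needing imputation:", nrow(to_impute), "\n")
cat("Prefixes covered by the mapping table:",
    sum(to_impute$sku_prefix_2 %in% final_prefix_category_table$sku_prefix_2), "\n")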

# Impute category_name_1
clean_data <- clean_data %>%
  left_join(final_prefix_category_table %>% select(sku_prefix_2, imputed_category = category_name_1),
            by = "sku_prefix_2") %>%
  mutate(
    category_final = ifelse(
      category_status == "Have category",
      category_name_1,
      imputed_category
    )
  )

# Final cleaning: re-apply the invalid-row filter to all columns except
# category_name_1 (this also drops rows whose category could not be imputed,
# since their imputed_category / category_final is NA)
columns_to_check <- setdiff(names(clean_data), "category_name_1")
invalid_row_mask <- apply(clean_data[columns_to_check], 1, function(row) {
  any(
    is.na(row) | trimws(as.character(row)) %in% invalid_values
  )
})
clean_data <- clean_data[!invalid_row_mask, ]

cat("Final row count after full cleaning:", nrow(clean_data), "\n")
print(count(clean_data))

# Final invalid check
invalid_counts <- count_invalids(clean_data)
cat("Final invalid counts:\n")
print(invalid_counts)

# View the cleaned data
print(clean_data)
Row count after removing blanks: 584524 
Invalid row counts per column:
        item_id          status      created_at             sku           price 
              0              19               0              20               0 
    qty_ordered category_name_1 discount_amount  payment_method       BI Status 
              0            8014               0               0               1 
           Year           Month  Customer Since             M-Y              FY 
              0               0              11               0               0 
    Customer ID 
             11 
Row count after removing invalid rows: 584473 
# A tibble: 1 × 1
       n
   <int>
1 584473
Final row count after full cleaning: 583996 
# A tibble: 1 × 1
       n
   <int>
1 583996
Final invalid counts:
         item_id           status       created_at              sku 
               0                0                0                0 
           price      qty_ordered  category_name_1  discount_amount 
               0                0             7516                0 
  payment_method        BI Status             Year            Month 
               0                0                0                0 
  Customer Since              M-Y               FY      Customer ID 
               0                0                0                0 
 category_status       sku_prefix     sku_prefix_2 imputed_category 
               0                0                0                0 
  category_final 
               0 
# A tibble: 583,996 × 21
   item_id status    created_at          sku   price qty_ordered category_name_1
     <dbl> <chr>     <dttm>              <chr> <dbl>       <dbl> <chr>          
 1  211133 canceled  2016-07-01 00:00:00 "kcc…   240           1 Beauty & Groom…
 2  211134 canceled  2016-07-01 00:00:00 "Ego…  2450           1 Women's Fashion
 3  211135 complete  2016-07-01 00:00:00 "kcc…   360           1 Beauty & Groom…
 4  211136 order_re… 2016-07-01 00:00:00 "BK7…   555           2 Soghaat        
 5  211137 canceled  2016-07-01 00:00:00 "UK_…    80           1 Soghaat        
 6  211138 complete  2016-07-01 00:00:00 "kcc…   360           1 Beauty & Groom…
 7  211139 complete  2016-07-01 00:00:00 "UK_…   170           1 Soghaat        
 8  211140 canceled  2016-07-01 00:00:00 "App… 96499           1 Mobiles & Tabl…
 9  211141 canceled  2016-07-01 00:00:00 "App… 96499           1 Mobiles & Tabl…
10  211142 complete  2016-07-01 00:00:00 "GFC…  5500           1 Appliances     
# ℹ 583,986 more rows
# ℹ 14 more variables: discount_amount <dbl>, payment_method <chr>,
#   `BI Status` <chr>, Year <dbl>, Month <dbl>, `Customer Since` <chr>,
#   `M-Y` <dttm>, FY <chr>, `Customer ID` <dbl>, category_status <chr>,
#   sku_prefix <chr>, sku_prefix_2 <chr>, imputed_category <chr>,
#   category_final <chr>
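
# The scripts below read a "Scrubbed Data" sheet from the workbook. A minimal
# sketch of exporting clean_data for that purpose, assuming the writexl package
# is installed (the file name here is only illustrative; the later read_excel()
# calls would need to point at wherever the sheet is actually saved):
library(writexl)
write_xlsx(list(`Scrubbed Data` = clean_data),
           "C:/Users/A/Downloads/Scrubbed Data Export.xlsx")
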
library(readxl)
library(dplyr)
library(ggplot2)
library(lubridate)
library(scales)
library(tidyr)
library(ggcorrplot)

# Load Excel file
file_path <- "C:\\Users\\A\\Downloads\\Pakistan Largest Ecommerce Dataset Updated.xlsx"
df <- read_excel(file_path, sheet = "Scrubbed Data")

# Change date format
df_raw <- read_excel(file_path, sheet = "Scrubbed Data", col_types = "text")
df_raw$created_at <- as.Date(as.numeric(df_raw$created_at), origin = "1899-12-30")
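
# Sanity check (a sketch): confirm the Excel serial-number conversion produced
# plausible dates and count any rows that failed to parse (NAs introduced above).
summary(df_raw$created_at)
cat("Unparsed dates:", sum(is.na(df_raw$created_at)), "\n")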

df <- df_raw %>%
  mutate(
    `M-Y` = format(created_at, "%Y-%m"),
    day_of_week = weekdays(created_at)
  )

cat("✅ Dataset Overview:\n")
cat("Total Rows:", nrow(df), "\n")
cat("Total Columns:", ncol(df), "\n")
cat("Unique SKUs:", n_distinct(df$sku), "\n")
cat("Unique Customers:", n_distinct(df$`Customer ID`), "\n")
cat("Date Range:", as.character(min(df$created_at, na.rm = TRUE)), 
    "to", as.character(max(df$created_at, na.rm = TRUE)), "\n")

best_selling <- df %>%
  group_by(category_name_1) %>%
  summarise(item_count = n()) %>%
  arrange(item_count)

# Plot: Best Selling Categories
ggplot(best_selling, aes(x = item_count, y = reorder(category_name_1, item_count))) +
  geom_col(fill = "skyblue") +
  geom_text(aes(label = paste0(round(item_count / 1000), "k")), hjust = -0.1) +
  labs(
    title = "Transactions by Product Category",
    x = "Transactions",
    y = "Product Category"
  ) +
  scale_x_continuous(labels = label_number(scale = 1/1000, suffix = "k")) +
  theme_minimal()

# Recalculate day_of_week if not already added
if (!"day_of_week" %in% colnames(df)) {
  df$day_of_week <- weekdays(df$created_at)
}

# Count orders by day
daywise_total <- df %>%
  filter(!is.na(day_of_week)) %>%
  count(day_of_week)

# Plot
ggplot(daywise_total, aes(x = reorder(day_of_week, -n), y = n)) +
  geom_col(fill = "coral") +
  labs(title = "Orders by Day of the Week", x = "Day", y = "Order Count") +
  theme_minimal()

# Ensure columns are numeric
df$qty_ordered <- as.numeric(df$qty_ordered)

# Extract Year and Month
df <- df %>%
  mutate(
    Year = year(created_at),
    Month = month(created_at)
  )

# Group by Year & Month
monthly_by_year <- df %>%
  filter(!is.na(created_at)) %>%
  group_by(Year, Month) %>%
  summarise(total_qty = sum(qty_ordered, na.rm = TRUE), .groups = "drop")

# Line plot: one line per year
ggplot(monthly_by_year, aes(x = Month, y = total_qty, color = factor(Year))) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_x_continuous(breaks = 1:12) +
  labs(title = "Monthly Transactions by Year",
       x = "Month",
       y = "Transactions",
       color = "Year") +
  theme_minimal()

# Ensure columns are numeric
df$discount_amount <- as.numeric(df$discount_amount)
df$created_at <- as.POSIXct(df$created_at)  # no-op if already a date/time column
df$item_id <- as.character(df$sku)  # note: this overwrites item_id with the SKU string; item_id is not used again in this script

# Extract Year and Month
df <- df %>%
  mutate(Year = year(created_at),
         Month = month(created_at))

# Count non-zero discount transactions per Year-Month
discount_df <- df %>%
  filter(discount_amount != 0) %>%
  group_by(Year, Month) %>%
  summarise(count_discount = n(), .groups = "drop")

# Count all transactions per Year-Month
transaction_df <- df %>%
  group_by(Year, Month) %>%
  summarise(transactions = n(), .groups = "drop")

# Merge discount into transaction summary
merged_df <- left_join(transaction_df, discount_df, by = c("Year", "Month")) %>%
  mutate(count_discount = replace_na(count_discount, 0))

# Correlation matrix
correlation_matrix <- merged_df %>%
  select(Year, Month, transactions, count_discount) %>%
  cor(use = "complete.obs")
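
# Numeric view of the same correlations (rounded), to accompany the heatmap below
print(round(correlation_matrix, 2))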

# Monthly totals for plotting
monthly_plot_df <- merged_df %>%
  group_by(Month) %>%
  summarise(Transactions = sum(transactions),
            Discount = sum(count_discount), .groups = "drop") %>%
  pivot_longer(cols = c(Transactions, Discount),
               names_to = "Metric", values_to = "Count")

# Correlation heatmap
ggcorrplot(correlation_matrix,
           lab = TRUE,
           method = "square",
           title = "Correlation Heatmap")

# Line plot of monthly totals
ggplot(monthly_plot_df, aes(x = Month, y = Count, color = Metric, group = Metric)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_x_continuous(breaks = 1:12) +
  scale_y_continuous(labels = label_number(scale = 1/1000, suffix = "k")) +
  labs(title = "Monthly Transactions & Discounts",
       x = "Month", y = "Transaction Count") +
  scale_color_manual(values = c("Transactions" = "red", "Discount" = "blue")) +
  theme_minimal()

payment_method <- df %>%
  group_by(payment_method) %>%
  summarise(item_count = n()) %>%
  arrange(item_count)

# Visualization
ggplot(payment_method, aes(x = item_count, y = reorder(payment_method, item_count))) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = paste0(round(item_count / 1000), "k")), hjust = -0.1) +
  labs(
    title = "Transactions by Payment Method",
    x = "Transactions",
    y = "Payment Method"
  ) +
  scale_x_continuous(labels = label_number(scale = 1/1000, suffix = "k")) +
  theme_minimal()
✅ Dataset Overview:
Total Rows: 584524 
Total Columns: 20 
Unique SKUs: 84800 
Unique Customers: 115327 
Date Range: 2016-07-01 to 2018-08-28 
# ----------------------
# Load Required Libraries
# ----------------------
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(caret)
library(randomForest)
library(xgboost)
library(ggplot2)

# ----------------------
# Load and Prepare Data
# ----------------------
file_path <- "C:/Users/A/Downloads/Pakistan Largest Ecommerce Dataset Updated.xlsx"
df <- read_excel(file_path, sheet = "Scrubbed Data")

# Rename relevant columns (match Excel file exactly)
df <- df %>%
  rename(
    customer_id = `Customer ID`,
    category_final = `category_name_1`
  )

# Convert necessary columns
df$customer_id <- as.character(df$customer_id)
df$created_at <- as.Date(df$created_at)

# ----------------------
# Customer Summary
# ----------------------
customer_summary <- df %>%
  filter(status %in% c("complete", "paid", "received")) %>%
  group_by(customer_id) %>%
  summarise(
    total_orders = n(),
    total_quantity = sum(qty_ordered, na.rm = TRUE),
    total_spent = sum(price * qty_ordered, na.rm = TRUE),
    avg_order_value = total_spent / total_orders,
    unique_categories = n_distinct(category_final),
    most_frequent_category = {
      valid_cats <- category_final[!is.na(category_final)]
      if (length(valid_cats) == 0) NA_character_ else names(which.max(table(valid_cats)))
    }
  )

# ----------------------
# Segment Based on Spending
# ----------------------
p33 <- quantile(customer_summary$total_spent, 0.33, na.rm = TRUE)
p66 <- quantile(customer_summary$total_spent, 0.66, na.rm = TRUE)

customer_summary <- customer_summary %>%
  mutate(
    segment = case_when(
      total_spent <= p33 ~ "Low",
      total_spent <= p66 ~ "Medium",
      TRUE ~ "High"
    )
  )

# View count of each segment
print(table(customer_summary$segment))
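
# An alternative three-way split (a sketch): dplyr::ntile assigns near-equal-sized
# bins by rank instead of the 33rd/66th percentile cutoffs above; the cross-tab
# shows where the two labelings differ.
segment_ntile <- factor(ntile(customer_summary$total_spent, 3),
                        labels = c("Low", "Medium", "High"))
print(table(quantile_based = customer_summary$segment, ntile_based = segment_ntile))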

# Segment-wise summary stats
segment_stats <- customer_summary %>%
  group_by(segment) %>%
  summarise(
    n_customers = n(),
    avg_spent = mean(total_spent),
    avg_order_value = mean(avg_order_value),
    avg_quantity = mean(total_quantity)
  )
print(segment_stats)

# ----------------------
# First 3 Months Analysis
# ----------------------
first_order_dates <- df %>%
  group_by(customer_id) %>%
  summarise(first_order = min(created_at))

df <- df %>%
  left_join(first_order_dates, by = "customer_id") %>%
  mutate(
    months_since_first = interval(first_order, created_at) %/% months(1)
  )

early_data <- df %>%
  filter(months_since_first <= 3 & status %in% c("complete", "paid", "received"))

early_summary <- early_data %>%
  group_by(customer_id) %>%
  summarise(
    total_orders = n(),
    total_quantity = sum(qty_ordered, na.rm = TRUE),
    total_spent = sum(price * qty_ordered, na.rm = TRUE),
    avg_order_value = total_spent / total_orders,
    unique_categories = n_distinct(category_final)
  )

# ----------------------
# Prepare Model Data
# ----------------------
model_data <- early_summary %>%
  left_join(customer_summary %>% select(customer_id, segment), by = "customer_id") %>%
  filter(!is.na(segment))

model_data$segment <- as.factor(model_data$segment)
model_data <- model_data %>% select(-customer_id)

# ----------------------
# Split Data
# ----------------------
set.seed(123)
train_index <- createDataPartition(model_data$segment, p = 0.8, list = FALSE)
train <- model_data[train_index, ]
test <- model_data[-train_index, ]

# ----------------------
# Train XGBoost Classifier
# ----------------------
grid <- expand.grid(
  nrounds = 20,
  max_depth = 2,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 0.8,
  min_child_weight = 3,
  subsample = 0.8
)

control <- trainControl(method = "cv", number = 3)

system.time({
  model <- train(
    segment ~ ., 
    data = train, 
    method = "xgbTree",
    trControl = control,
    tuneGrid = grid
  )
})

# ----------------------
# Evaluate Model
# ----------------------
pred <- predict(model, newdata = test)
conf_matrix <- confusionMatrix(pred, test$segment)
print(conf_matrix)

# Variable Importance
print(varImp(model))
  High    Low Medium 
 27088  26301  26282 
# A tibble: 3 × 5
  segment n_customers avg_spent avg_order_value avg_quantity
  <chr>         <int>     <dbl>           <dbl>        <dbl>
1 High          27088    47203.          13468.         9.90
2 Low           26301      705.            585.         1.50
3 Medium        26282     2740.           1602.         3.18



   user  system elapsed 
   3.22    1.78    1.81 


Confusion Matrix and Statistics

          Reference
Prediction High  Low Medium
    High   4821    0      2
    Low     113 5182    245
    Medium  358    0   4908

Overall Statistics
                                          
               Accuracy : 0.9541          
                 95% CI : (0.9507, 0.9573)
    No Information Rate : 0.3386          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.9311          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       

Statistics by Class:

                     Class: High Class: Low Class: Medium
Sensitivity               0.9110     1.0000        0.9521
Specificity               0.9998     0.9657        0.9658
Pos Pred Value            0.9996     0.9354        0.9320
Neg Pred Value            0.9564     1.0000        0.9762
Prevalence                0.3386     0.3316        0.3298
Detection Rate            0.3085     0.3316        0.3140
Detection Prevalence      0.3086     0.3545        0.3369
Balanced Accuracy         0.9554     0.9829        0.9590
xgbTree variable importance

                   Overall
total_spent       100.0000
avg_order_value    12.8109
total_quantity      0.9547
total_orders        0.8830
unique_categories   0.0000
# ----------------------
# Libraries for Visuals and Classification
# ----------------------
library(ggplot2)
library(tidyr)
library(dplyr)
library(caret)
library(randomForest)
library(lubridate)

# ----------------------
# Classification Metrics Plot (Customer)
# ----------------------
# These metrics are hard-coded from a model run; to keep the plot in sync with
# the latest model they could instead be pulled from cm$byClass (computed below).
class_metrics <- data.frame(
  Class = c("High", "Low", "Medium"),
  Sensitivity = c(0.9696, 1.0000, 0.9587),
  Specificity = c(1.0000, 0.9721, 0.9929),
  Precision   = c(1.0000, 0.9664, 0.9844),
  Balanced_Accuracy = c(0.9848, 0.9860, 0.9758)
)

class_metrics_long <- class_metrics %>%
  pivot_longer(cols = -Class, names_to = "Metric", values_to = "Value")

ggplot(class_metrics_long, aes(x = Class, y = Value, fill = Metric)) +
  geom_col(position = position_dodge()) +
  geom_text(aes(label = sprintf("%.2f", Value)),
            position = position_dodge(width = 0.9), vjust = -0.3, size = 3) +
  scale_fill_brewer(palette = "Set2") +
  ylim(0, 1.1) +
  labs(title = "Classification Metrics by Segment",
       y = "Score",
       x = "Customer Segment",
       fill = "Metric") +
  theme_minimal(base_size = 13)

# ----------------------
# Confusion Matrix Plot (Customer)
# ----------------------
# Assumes `pred` and `test$segment` already exist from previous model
cm <- confusionMatrix(pred, test$segment)

cm_df <- as.data.frame(cm$table)
colnames(cm_df) <- c("Actual", "Predicted", "Freq")

ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), vjust = 0.5, size = 5) +
  scale_fill_gradient(low = "lightblue", high = "steelblue") +
  labs(title = "Confusion Matrix (Customer Segment Prediction)",
       x = "Actual Segment", y = "Predicted Segment") +
  theme_minimal(base_size = 13)

# ----------------------
# SKU SEGMENTATION CLASSIFICATION
# ----------------------

# Parse created_at safely
df$created_at <- parse_date_time(df$created_at, orders = c("ymd", "dmy", "mdy", "Ymd HMS", "ymd HMS"))
df <- df %>% filter(!is.na(created_at))  # remove rows with invalid dates

# SKU-level summary
item_summary <- df %>%
  filter(status %in% c("complete", "paid", "received")) %>%
  group_by(sku) %>%
  summarise(
    total_orders = n(),
    total_quantity = sum(qty_ordered, na.rm = TRUE),
    total_revenue = sum(price * qty_ordered, na.rm = TRUE),
    avg_price = mean(price, na.rm = TRUE),
    unique_buyers = n_distinct(customer_id),
    num_days_sold = n_distinct(as.Date(created_at)),
    category = first(na.omit(category_final)),
    .groups = "drop"
  )

# Revenue-based segmentation
p33 <- quantile(item_summary$total_revenue, 0.33, na.rm = TRUE)
p66 <- quantile(item_summary$total_revenue, 0.66, na.rm = TRUE)

item_summary <- item_summary %>%
  mutate(
    segment = case_when(
      total_revenue <= p33 ~ "Low",
      total_revenue <= p66 ~ "Medium",
      TRUE ~ "High"
    ),
    segment = factor(segment, levels = c("Low", "Medium", "High"))
  )

print(table(item_summary$segment))

# Prepare modeling data
model_item_data <- item_summary %>%
  select(-sku, -category)

# Train/test split
set.seed(123)
train_index <- createDataPartition(model_item_data$segment, p = 0.8, list = FALSE)
train_item <- model_item_data[train_index, ]
test_item <- model_item_data[-train_index, ]

# Train Random Forest
item_model <- randomForest(segment ~ ., data = train_item, ntree = 100)

# Predict & evaluate
pred_item <- predict(item_model, newdata = test_item)
conf_matrix_item <- confusionMatrix(pred_item, test_item$segment)
print(conf_matrix_item)

# Feature importance plot
varImpPlot(item_model, main = "Feature Importance - Item Segmentation")

# Bar plot of item segments
ggplot(item_summary, aes(x = segment, fill = segment)) +
  geom_bar() +
  labs(title = "Item Segment Distribution",
       x = "Item Segment", y = "Number of Items") +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal()
   Low Medium   High 
 20851  20858  21351 
Confusion Matrix and Statistics

          Reference
Prediction  Low Medium High
    Low    4170      0    0
    Medium    0   4171    0
    High      0      0 4270

Overall Statistics
                                     
               Accuracy : 1          
                 95% CI : (0.9997, 1)
    No Information Rate : 0.3386     
    P-Value [Acc > NIR] : < 2.2e-16  
                                     
                  Kappa : 1          
                                     
 Mcnemar's Test P-Value : NA         

Statistics by Class:

                     Class: Low Class: Medium Class: High
Sensitivity              1.0000        1.0000      1.0000
Specificity              1.0000        1.0000      1.0000
Pos Pred Value           1.0000        1.0000      1.0000
Neg Pred Value           1.0000        1.0000      1.0000
Prevalence               0.3307        0.3307      0.3386
Detection Rate           0.3307        0.3307      0.3386
Detection Prevalence     0.3307        0.3307      0.3386
Balanced Accuracy        1.0000        1.0000      1.0000
# ------------------------
# Load Required Libraries
# ------------------------
library(readxl)
library(dplyr)
library(lubridate)
library(tidyr)
library(rpart)
library(rpart.plot)
library(tidyverse)  # includes ggplot2, dplyr, readr, etc.

# Convert date column (created_at was already parsed above; this just drops the time component)
df$created_at <- as.Date(df$created_at)
head(df$created_at)

# ------------------------
# Feature Engineering
# ------------------------
df <- df %>%
  mutate(
    month = month(created_at),
    year = year(created_at),
    month_year = floor_date(created_at, "month"),
    is_sale_month = ifelse(month %in% c(11, 12), 1, 0),  # Nov/Dec sale period
    is_cod = ifelse(payment_method == "cod", 1, 0)
  )

# ------------------------
# Top 3 Categories by Order Volume
# ------------------------
top_categories <- df %>%
  count(category_final, sort = TRUE) %>%
  top_n(3, n) %>%
  pull(category_final)

df_top <- df %>% filter(category_final %in% top_categories)

# ------------------------
# Aggregate Monthly Data
# ------------------------
monthly_data <- df_top %>%
  group_by(category_final, month_year, month, year) %>%
  summarise(
    order_volume = n(),
    sale_month = first(is_sale_month),
    cod_ratio = mean(is_cod, na.rm = TRUE),
    .groups = 'drop'
  )

# ------------------------
# Split Data into Train/Test
# ------------------------
set.seed(42)
train_index <- 1:round(0.7 * nrow(monthly_data))
train <- monthly_data[train_index, ]
test <- monthly_data[-train_index, ]

# ------------------------
# Decision Tree Model (Raw)
# ------------------------
tree_model <- rpart(
  order_volume ~ month + year + sale_month + cod_ratio + category_final,
  data = train,
  method = "anova",
  control = rpart.control(cp = 0.01)
)

pred <- predict(tree_model, newdata = test)
actual <- test$order_volume

# MAPE (manual)
mape <- mean(abs((actual - pred) / actual)) * 100
rmse <- sqrt(mean((actual - pred)^2))

cat("MAPE:", round(mape, 2), "%\n")
cat("RMSE:", round(rmse, 2), "\n")

# ------------------------
# Clean Numeric Columns (optional, safety)
# ------------------------
train$order_volume <- as.numeric(train$order_volume)
test$order_volume <- as.numeric(test$order_volume)

# ------------------------
# Log Transform Target
# ------------------------
train$log_order_volume <- log1p(train$order_volume)
test$log_order_volume <- log1p(test$order_volume)

# ------------------------
# Evaluation Function
# ------------------------
evaluate_model <- function(actual, predicted, label) {
  valid <- which(!is.na(actual) & !is.na(predicted) & is.finite(actual) & actual != 0)
  if (length(valid) == 0) {
    cat("⚠️", label, "- No valid data to evaluate\n\n")
    return()
  }

  actual_clean <- actual[valid]
  predicted_clean <- predicted[valid]

  mape <- mean(abs((actual_clean - predicted_clean) / actual_clean)) * 100
  rmse <- sqrt(mean((actual_clean - predicted_clean)^2))

  cat(paste0("Model: ", label, "\n"))
  cat("MAPE: ", round(mape, 2), "%\n")
  cat("RMSE:", round(rmse, 2), "\n\n")
}

# ------------------------
# Decision Tree (Log Target)
# ------------------------
dt_model <- rpart(
  log_order_volume ~ month + year + sale_month + cod_ratio,
  data = train,
  method = "anova"
)
dt_pred_log <- predict(dt_model, newdata = test)
dt_pred <- expm1(dt_pred_log)

evaluate_model(test$order_volume, dt_pred, "Decision Tree")

# ------------------------
# Linear Regression (Log Target)
# ------------------------
lm_model <- lm(
  log_order_volume ~ month + year + sale_month + cod_ratio,
  data = train
)
lm_pred_log <- predict(lm_model, newdata = test)
lm_pred <- expm1(lm_pred_log)

evaluate_model(test$order_volume, lm_pred, "Linear Regression")

# ------------------------
# Optional: Plot Tree
# ------------------------
rpart.plot(dt_model, main = "Regression Tree for Order Volume")
MAPE: 107.06 %
RMSE: 2648.97 
Model: Decision Tree
MAPE:  76.53 %
RMSE: 1775.14 

Model: Linear Regression
MAPE:  62.07 %
RMSE: 1409.72 
# -----------------------
# Load Required Libraries
# -----------------------
library(dplyr)
library(rpart)
library(rpart.plot)
library(randomForest)
library(xgboost)

# -----------------------
# Decision Tree (Tuned)
# -----------------------
control_params <- rpart.control(
  cp = 0.001,
  maxdepth = 10,
  minsplit = 20,
  minbucket = 5
)

dt_model_tuned <- rpart(
  log_order_volume ~ month + year + sale_month + cod_ratio,
  data = train,
  method = "anova",
  control = control_params
)

# Predict & Evaluate
dt_pred_log <- predict(dt_model_tuned, newdata = test)
dt_pred <- expm1(dt_pred_log)

actual <- test$order_volume
pred <- dt_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / ifelse(actual == 0, 1, actual))) * 100

cat("Tuned Decision Tree RMSE:", round(rmse, 2), "\n")
cat("Tuned Decision Tree MAPE:", round(mape, 2), "%\n")

# -----------------------
# Decision Tree (Pruned)
# -----------------------
best_cp <- dt_model_tuned$cptable[which.min(dt_model_tuned$cptable[,"xerror"]), "CP"]
dt_model_pruned <- prune(dt_model_tuned, cp = best_cp)
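
# Optional diagnostic (a sketch): inspect how the cross-validated error varies
# with the complexity parameter before settling on best_cp above.
printcp(dt_model_tuned)
plotcp(dt_model_tuned)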

dt_pruned_pred_log <- predict(dt_model_pruned, newdata = test)
dt_pruned_pred <- expm1(dt_pruned_pred_log)

actual <- test$order_volume
pred <- dt_pruned_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / ifelse(actual == 0, 1, actual))) * 100

cat("Pruned Tree RMSE:", round(rmse, 2), "\n")
cat("Pruned Tree MAPE:", round(mape, 2), "%\n")

# -----------------------
# Random Forest
# -----------------------
rf_model <- randomForest(
  log_order_volume ~ month + year + sale_month + cod_ratio,
  data = train,
  ntree = 500,
  mtry = 2,
  importance = TRUE
)
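
# Since importance = TRUE above, variable importance can be inspected directly
# (a sketch using randomForest's built-in plot):
varImpPlot(rf_model, main = "Variable Importance - Monthly Order Volume")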

rf_pred_log <- predict(rf_model, newdata = test)
rf_pred <- expm1(rf_pred_log)

actual <- test$order_volume
pred <- rf_pred
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / ifelse(actual == 0, 1, actual))) * 100

cat("Random Forest RMSE:", round(rmse, 2), "\n")
cat("Random Forest MAPE:", round(mape, 2), "%\n")

# -----------------------
# Decision Tree (with category_final)
# -----------------------
# Make sure category_final exists (e.g., from renamed 'category_name_1')
if ("category_final" %in% names(train)) {
  dt_model_cat <- rpart(
    log_order_volume ~ category_final + month + year + sale_month + cod_ratio,
    data = train,
    method = "anova"
  )

  dt_cat_pred_log <- predict(dt_model_cat, newdata = test)
  dt_cat_pred <- expm1(dt_cat_pred_log)

  actual <- test$order_volume
  pred <- dt_cat_pred
  rmse <- sqrt(mean((actual - pred)^2))
  mape <- mean(abs((actual - pred) / ifelse(actual == 0, 1, actual))) * 100

  cat("Tree w/ Category RMSE:", round(rmse, 2), "\n")
  cat("Tree w/ Category MAPE:", round(mape, 2), "%\n")
}

# -----------------------
# XGBoost
# -----------------------

# Ensure sale_month exists (add if not)
if (!"sale_month" %in% colnames(train)) {
  train$sale_month <- ifelse(train$month %in% c(11, 12), 1, 0)
  test$sale_month <- ifelse(test$month %in% c(11, 12), 1, 0)
}

predictors <- c("month", "year", "sale_month", "cod_ratio")

train_matrix <- xgb.DMatrix(data = as.matrix(train[, predictors]), label = train$log_order_volume)
test_matrix <- xgb.DMatrix(data = as.matrix(test[, predictors]))

xgb_model <- xgboost(
  data = train_matrix,
  nrounds = 100,
  objective = "reg:squarederror",
  verbose = 0
)

xgb_pred_log <- predict(xgb_model, newdata = test_matrix)
xgb_pred <- expm1(xgb_pred_log)

actual <- test$order_volume
pred <- xgb_pred
rmse <- sqrt(mean((actual - pred)^2, na.rm = TRUE))
mape <- mean(abs((actual - pred) / actual), na.rm = TRUE) * 100

cat("XGBoost RMSE:", round(rmse, 2), "\n")
cat("XGBoost MAPE:", round(mape, 2), "%\n")
Tuned Decision Tree RMSE: 1775.14 
Tuned Decision Tree MAPE: 76.53 %
Pruned Tree RMSE: 1775.14 
Pruned Tree MAPE: 76.53 %
Random Forest RMSE: 1175.87 
Random Forest MAPE: 53.69 %
Tree w/ Category RMSE: 1787.89 
Tree w/ Category MAPE: 75.5 %
XGBoost RMSE: 2820.28 
XGBoost MAPE: 54.61 %
# Load libraries
library(xgboost)
library(Matrix)
library(dplyr)

# -------- PREPARE DATA --------
# Ensure predictors exist
predictors <- c("month", "year", "cod_ratio", "sale_month")  # or adapt based on your data
train_x <- train[, predictors]
test_x <- test[, predictors]

# Convert to matrix
dtrain <- xgb.DMatrix(data = as.matrix(train_x), label = train$log_order_volume)
dtest <- xgb.DMatrix(data = as.matrix(test_x), label = test$log_order_volume)

# -------- TRAINING PARAMETERS --------
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse",
  max_depth = 6,
  eta = 0.1,                 # learning rate
  subsample = 0.8,
  colsample_bytree = 0.8
)

# -------- TRAIN MODEL --------
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 300,
  early_stopping_rounds = 10,
  watchlist = list(train = dtrain),
  verbose = 0
)
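
# A sketch of also monitoring the held-out set, so early stopping reacts to
# out-of-sample RMSE rather than training RMSE alone (dtest already carries
# labels above; reusing the test set for monitoring is a shortcut, a separate
# validation split would be cleaner):
xgb_model_monitored <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 300,
  early_stopping_rounds = 10,
  watchlist = list(train = dtrain, eval = dtest),
  verbose = 0
)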

# -------- PREDICT --------
xgb_pred_log <- predict(xgb_model, newdata = dtest)
xgb_pred <- expm1(xgb_pred_log)  # inverse log1p

# -------- MANUAL EVALUATION --------
actual <- test$order_volume
pred <- xgb_pred

rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / actual)) * 100

cat("✅ XGBoost RMSE:", round(rmse, 2), "\n")
cat("✅ XGBoost MAPE:", round(mape, 2), "%\n")


# Make sure the predictors are present
predictors <- c("month", "year", "cod_ratio", "sale_month")  # Adjust as needed

train_x <- train[, predictors]
train_y <- train$log_order_volume

# Convert to matrix
dtrain <- xgb.DMatrix(data = as.matrix(train_x), label = train_y)
params <- list(
  objective = "reg:squarederror",
  eta = 0.1,                 # Learning rate (can try 0.01, 0.05 later)
  max_depth = 6,             # Try 4–10
  subsample = 0.8,           # Row sampling
  colsample_bytree = 0.8,    # Feature sampling
  eval_metric = "rmse"
)

# Cross-validation
set.seed(123)
xgb_cv <- xgb.cv(
  params = params,
  data = dtrain,
  nrounds = 500,
  nfold = 5,
  early_stopping_rounds = 10,
  print_every_n = 10,
  verbose = 1
)

best_nrounds <- xgb_cv$best_iteration
cat("Best nrounds:", best_nrounds, "\n")
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = best_nrounds
)


# Prepare test set
test_x <- test[, predictors]
dtest <- xgb.DMatrix(data = as.matrix(test_x))

# Predict
xgb_pred_log <- predict(xgb_model, newdata = dtest)
xgb_pred <- expm1(xgb_pred_log)  # Inverse of log1p

# Manual Evaluation
actual <- test$order_volume
pred <- xgb_pred

rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / actual)) * 100

cat("✅ Tuned XGBoost RMSE:", round(rmse, 2), "\n")
cat("✅ Tuned XGBoost MAPE:", round(mape, 2), "%\n")


colnames(train)


library(rpart.plot)

# Visualize the tree
rpart.plot(dt_model,
           type = 2,          # Label all nodes
           extra = 101,       # Show fitted value and % of observations
           fallen.leaves = TRUE,
           cex = 0.7,         # Font size
           main = "Decision Tree: Predicting Log Order Volume")

library(dplyr)
library(lubridate)

# Ensure created_at is Date type
df$created_at <- as.Date(df$created_at)

# Aggregate by date
daily_data <- df %>%
  group_by(created_at) %>%
  summarise(
    order_volume = n(),  # Or sum(qty_ordered) if appropriate
    cod_ratio = mean(tolower(payment_method) == "cod", na.rm = TRUE),  # payment_method is lowercase ("cod") in this dataset
    .groups = "drop"
  ) %>%
  mutate(
    day = day(created_at),
    week = week(created_at),
    month = month(created_at),
    year = year(created_at),
    sale_day = ifelse(month(created_at) %in% c(11, 12), 1, 0),  # Adjust for sale logic
    log_order_volume = log1p(order_volume)
  )

weekly_data <- df %>%
  mutate(week_start = floor_date(as.Date(created_at), "week")) %>%
  group_by(week_start) %>%
  summarise(
    order_volume = n(),
    cod_ratio = mean(tolower(payment_method) == "cod", na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    week = week(week_start),
    month = month(week_start),
    year = year(week_start),
    sale_week = ifelse(month %in% c(11, 12), 1, 0),
    log_order_volume = log1p(order_volume)
  )

#daily data

n_daily <- nrow(daily_data)
split_daily <- floor(0.8 * n_daily)

train_daily <- daily_data[1:split_daily, ]
test_daily <- daily_data[(split_daily + 1):n_daily, ]


#Weekly Data
n_weekly <- nrow(weekly_data)
split_weekly <- floor(0.8 * n_weekly)

train_weekly <- weekly_data[1:split_weekly, ]
test_weekly <- weekly_data[(split_weekly + 1):n_weekly, ]

library(rpart)

dt_daily <- rpart(log_order_volume ~ day + month + year + sale_day + cod_ratio,
                  data = train_daily, method = "anova")

pred_log_daily <- predict(dt_daily, newdata = test_daily)
pred_daily <- expm1(pred_log_daily)

rmse_daily <- sqrt(mean((test_daily$order_volume - pred_daily)^2))
mape_daily <- mean(abs(test_daily$order_volume - pred_daily) / test_daily$order_volume) * 100

cat("Daily RMSE:", round(rmse_daily, 2), "\n")
cat("Daily MAPE:", round(mape_daily, 2), "%\n")

dt_weekly <- rpart(log_order_volume ~ week + month + year + sale_week + cod_ratio,
                   data = train_weekly, method = "anova")

pred_log_weekly <- predict(dt_weekly, newdata = test_weekly)
pred_weekly <- expm1(pred_log_weekly)

rmse_weekly <- sqrt(mean((test_weekly$order_volume - pred_weekly)^2))
mape_weekly <- mean(abs(test_weekly$order_volume - pred_weekly) / test_weekly$order_volume) * 100

cat("Weekly RMSE:", round(rmse_weekly, 2), "\n")
cat("Weekly MAPE:", round(mape_weekly, 2), "%\n")


library(rpart.plot)

# Visualize Daily Tree
rpart.plot(dt_daily, main = "Daily-Level Decision Tree")

# Visualize Weekly Tree
rpart.plot(dt_weekly, main = "Weekly-Level Decision Tree")


sum(is.na(train_daily))
sum(is.na(test_daily))

train_daily <- na.omit(train_daily)
test_daily <- na.omit(test_daily)

library(randomForest)

rf_daily <- randomForest(
  log_order_volume ~ day + month + year + sale_day + cod_ratio,
  data = train_daily,
  ntree = 100
)

pred_rf_log <- predict(rf_daily, newdata = test_daily)
pred_rf <- expm1(pred_rf_log)

actual <- test_daily$order_volume

# Handle MAPE safely
mape_rf <- mean(abs(actual - pred_rf) / ifelse(actual == 0, 1, actual)) * 100
rmse_rf <- sqrt(mean((actual - pred_rf)^2))

cat("Random Forest Daily RMSE:", round(rmse_rf, 2), "\n")
cat("Random Forest Daily MAPE:", round(mape_rf, 2), "%\n")


library(xgboost)
library(dplyr)



# Assume 'daily_data' is your data frame
daily_data <- daily_data %>%
  select(order_volume, log_order_volume, cod_ratio, day, week, month, year, sale_day) %>%
  na.omit()

# Split into train and test
set.seed(123)
train_index <- sample(nrow(daily_data), 0.8 * nrow(daily_data))
train <- daily_data[train_index, ]
test <- daily_data[-train_index, ]

# Prepare matrix for XGBoost
train_matrix <- as.matrix(train %>% select(-order_volume, -log_order_volume))
test_matrix <- as.matrix(test %>% select(-order_volume, -log_order_volume))

dtrain <- xgb.DMatrix(data = train_matrix, label = train$log_order_volume)
dtest <- xgb.DMatrix(data = test_matrix, label = test$log_order_volume)


# Train XGBoost model
xgb_model <- xgboost(
  data = dtrain,
  objective = "reg:squarederror",
  nrounds = 100,
  eta = 0.1,
  max_depth = 6,
  verbose = 0
)


pred_log <- predict(xgb_model, dtest)
pred <- expm1(pred_log)  # convert back from log scale


importance <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance)

predictors <- c("day", "week", "month", "year", "sale_day", "cod_ratio")
predictors <- c("week", "month", "year", "sale_week", "cod_ratio") #For weekly data:
names(daily_data)
names(weekly_data)

set.seed(123)
n <- nrow(daily_data)
train_index <- 1:round(0.8 * n)
train <- daily_data[train_index, ]
test <- daily_data[-train_index, ]


predictors <- c("cod_ratio", "day", "week", "month", "year", "sale_day")


library(xgboost)

dtrain <- xgb.DMatrix(data = as.matrix(train[, predictors]), label = train$log_order_volume)
dtest <- xgb.DMatrix(data = as.matrix(test[, predictors]))

xgb_model <- xgboost(
  data = dtrain,
  objective = "reg:squarederror",
  nrounds = 100,
  verbose = 0
)

pred_log <- predict(xgb_model, dtest)
pred <- expm1(pred_log)
actual <- test$order_volume

# Align lengths
valid <- !is.na(actual) & !is.na(pred)
actual_valid <- actual[valid]
pred_valid <- pred[valid]

rmse <- sqrt(mean((actual_valid - pred_valid)^2))
mape <- mean(abs((actual_valid - pred_valid) / actual_valid)) * 100

cat("📈 XGBoost Daily RMSE:", round(rmse, 2), "\n")
cat("📉 XGBoost Daily MAPE:", round(mape, 2), "%\n")

set.seed(123)
n <- nrow(weekly_data)
train_index <- 1:round(0.8 * n)
train <- weekly_data[train_index, ]
test <- weekly_data[-train_index, ]
predictors <- c("cod_ratio", "week", "month", "year", "sale_week")

library(xgboost)

dtrain <- xgb.DMatrix(data = as.matrix(train[, predictors]), label = train$log_order_volume)
dtest <- xgb.DMatrix(data = as.matrix(test[, predictors]))

xgb_model <- xgboost(
  data = dtrain,
  objective = "reg:squarederror",
  nrounds = 100,
  verbose = 0
)


# Predict on test set (log scale)
pred_log <- predict(xgb_model, dtest)

# Transform back to original scale
pred <- expm1(pred_log)
actual <- test$order_volume

# Clean NAs and align lengths
valid <- !is.na(actual) & !is.na(pred)
actual_valid <- actual[valid]
pred_valid <- pred[valid]

# Manual RMSE
rmse <- sqrt(mean((actual_valid - pred_valid)^2))

# Manual MAPE
mape <- mean(abs((actual_valid - pred_valid) / actual_valid)) * 100

# Print results
cat("📈 XGBoost Weekly RMSE:", round(rmse, 2), "\n")
cat("📉 XGBoost Weekly MAPE:", round(mape, 2), "%\n")
✅ XGBoost RMSE: 2441.31 
✅ XGBoost MAPE: 51.32 %
[1] train-rmse:6.765721+0.047493    test-rmse:6.762781+0.208732 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.

[11]    train-rmse:2.523166+0.022013    test-rmse:2.517144+0.231220 
[21]    train-rmse:1.078371+0.021622    test-rmse:1.103971+0.210685 
[31]    train-rmse:0.542628+0.017206    test-rmse:0.672093+0.137364 
[41]    train-rmse:0.310065+0.020436    test-rmse:0.538751+0.076045 
[51]    train-rmse:0.192368+0.014813    test-rmse:0.489178+0.080094 
[61]    train-rmse:0.129610+0.016878    test-rmse:0.465144+0.088088 
[71]    train-rmse:0.091339+0.010812    test-rmse:0.459908+0.087141 
[81]    train-rmse:0.068357+0.010891    test-rmse:0.457898+0.088854 
Stopping. Best iteration:
[76]    train-rmse:0.078756+0.010398    test-rmse:0.456925+0.088584

Best nrounds: 76 
✅ Tuned XGBoost RMSE: 2216.98 
✅ Tuned XGBoost MAPE: 47.85 %
[1] "category_final"   "month_year"       "month"            "year"
[5] "order_volume"     "sale_month"       "cod_ratio"        "log_order_volume"
Daily RMSE: 765.47 
Daily MAPE: 129.33 %
Weekly RMSE: 3483.02 
Weekly MAPE: 99.53 %

[1] 0
[1] 0

Random Forest Daily RMSE: 733.75 
Random Forest Daily MAPE: 58.34 %
[1] "order_volume"     "log_order_volume" "cod_ratio"        "day"
[5] "week"             "month"            "year"             "sale_day"
[1] "week_start"       "order_volume"     "cod_ratio"        "week"
[5] "month"            "year"             "sale_week"        "log_order_volume"
📈 XGBoost Daily RMSE: 840.97 
📉 XGBoost Daily MAPE: 97.58 %
📈 XGBoost Weekly RMSE: 3974.95 
📉 XGBoost Weekly MAPE: 98.02 %