Setting Up The Dataset

Install and Load Libraries

This section installs and loads the required packages for data manipulation, visualization, and modelling.

# install.packages(c("readxl", "dplyr", "ggplot2", "lubridate", "caret", "randomForest", "xgboost", "corrplot", "stringr", "skimr", "tidyr"))

library(readxl)
library(dplyr)
library(ggplot2)
library(lubridate)
library(caret)
library(randomForest)
library(xgboost)
library(corrplot)
library(stringr)
library(skimr)
library(tidyr)
library(scales)
library(broom)
library(gridExtra)
library(factoextra)

Import Dataset

The dataset is imported from an Excel file and briefly inspected to understand its structure.

data <- read_excel("Advanced Topics Individual Dataset.xlsx")

knitr::kable(head(data))
Customer ID Age Gender Loyalty Member Product Type SKU Rating Order Status Payment Method Total Price Unit Price Quantity Purchase Date Shipping Type Add-ons Purchased Add-on Total
1000 53 Male No Smartphone SKU1004 2 Cancelled Credit Card 5538.33 791.19 7 2024-03-20 Standard Accessory,Accessory,Accessory 40.21
1000 53 Male No Tablet SKU1002 3 Completed Paypal 741.09 247.03 3 2024-04-20 Overnight Impulse Item 26.09
1002 41 Male No Laptop SKU1005 3 Completed Credit Card 1855.84 463.96 4 2023-10-17 Express NA 0.00
1002 41 Male Yes Smartphone SKU1004 2 Completed Cash 3164.76 791.19 4 2024-08-09 Overnight Impulse Item,Impulse Item 60.16
1003 75 Male Yes Smartphone SKU1001 5 Completed Cash 41.50 20.75 2 2024-05-21 Express Accessory 35.56
1004 41 Female No Smartphone SKU1001 5 Completed Credit Card 83.00 20.75 4 2024-05-26 Standard Impulse Item,Accessory 65.78

Data Cleaning

Standardizing Column Names

Column names are cleaned to ensure consistency and avoid issues in later analysis.

colnames(data) <- colnames(data) %>%
  str_replace_all(" ", "_") %>%
  str_replace_all("[^[:alnum:]_]", "")

knitr::kable(colnames(data))
x
Customer_ID
Age
Gender
Loyalty_Member
Product_Type
SKU
Rating
Order_Status
Payment_Method
Total_Price
Unit_Price
Quantity
Purchase_Date
Shipping_Type
Addons_Purchased
Addon_Total

Checking Missing Values

Missing values are checked across all variables.

na_summary <- sapply(data, function(x) sum(is.na(x)))
knitr::kable(na_summary)
x
Customer_ID 0
Age 0
Gender 1
Loyalty_Member 0
Product_Type 0
SKU 0
Rating 0
Order_Status 0
Payment_Method 0
Total_Price 0
Unit_Price 0
Quantity 0
Purchase_Date 0
Shipping_Type 0
Addons_Purchased 4868
Addon_Total 0

Handling Missing Values

Missing values in add-on related variables are handled.

data$Addons_Purchased[is.na(data$Addons_Purchased)] <- "None"
data$Addon_Total[is.na(data$Addon_Total)] <- 0

Fixing Inconsistencies

Categorical values are standardized for consistency.

data$Payment_Method <- str_to_title(data$Payment_Method)
data$Payment_Method[data$Payment_Method == "Paypal"] <- "PayPal"

Converting Data Types

Variables are converted into appropriate formats for analysis.

data$Purchase_Date <- as.Date(data$Purchase_Date)

data$Customer_ID <- as.character(data$Customer_ID)
data$Gender <- as.factor(data$Gender)
data$Loyalty_Member <- as.factor(data$Loyalty_Member)
data$Product_Type <- as.factor(data$Product_Type)
data$Payment_Method <- as.factor(data$Payment_Method)
data$Shipping_Type <- as.factor(data$Shipping_Type)
data$Order_Status <- as.factor(data$Order_Status)

str(data)
## tibble [20,000 × 16] (S3: tbl_df/tbl/data.frame)
##  $ Customer_ID     : chr [1:20000] "1000" "1000" "1002" "1002" ...
##  $ Age             : num [1:20000] 53 53 41 41 75 41 25 25 24 24 ...
##  $ Gender          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 1 1 1 2 2 ...
##  $ Loyalty_Member  : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 1 1 1 2 ...
##  $ Product_Type    : Factor w/ 5 levels "Headphones","Laptop",..: 3 5 2 3 3 3 4 2 3 2 ...
##  $ SKU             : chr [1:20000] "SKU1004" "SKU1002" "SKU1005" "SKU1004" ...
##  $ Rating          : num [1:20000] 2 3 3 2 5 5 3 3 2 3 ...
##  $ Order_Status    : Factor w/ 2 levels "Cancelled","Completed": 1 2 2 2 2 2 2 2 1 2 ...
##  $ Payment_Method  : Factor w/ 5 levels "Bank Transfer",..: 3 5 3 2 2 3 5 4 4 2 ...
##  $ Total_Price     : num [1:20000] 5538.3 741.1 1855.8 3164.8 41.5 ...
##  $ Unit_Price      : num [1:20000] 791.2 247 464 791.2 20.8 ...
##  $ Quantity        : num [1:20000] 7 3 4 4 2 4 9 9 7 9 ...
##  $ Purchase_Date   : Date[1:20000], format: "2024-03-20" "2024-04-20" ...
##  $ Shipping_Type   : Factor w/ 5 levels "Expedited","Express",..: 5 3 2 3 2 5 3 3 5 2 ...
##  $ Addons_Purchased: chr [1:20000] "Accessory,Accessory,Accessory" "Impulse Item" "None" "Impulse Item,Impulse Item" ...
##  $ Addon_Total     : num [1:20000] 40.2 26.1 0 60.2 35.6 ...

Summary of Clean Datasets

A summary of the cleaned dataset is displayed.

data_full <- data

data_clean <- data %>%
  filter(Order_Status == "Completed")

skim(data_clean)
Data summary
Name data_clean
Number of rows 13432
Number of columns 16
_______________________
Column type frequency:
character 3
Date 1
factor 6
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Customer_ID 0 1 4 5 0 9466 0
SKU 0 1 6 7 0 10 0
Addons_Purchased 0 1 4 55 0 76 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
Purchase_Date 0 1 2023-09-24 2024-09-23 2024-04-24 366

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Gender 1 1 FALSE 2 Mal: 6780, Fem: 6651
Loyalty_Member 0 1 FALSE 2 No: 10512, Yes: 2920
Product_Type 0 1 FALSE 5 Sma: 4004, Tab: 2745, Lap: 2686, Sma: 2636
Order_Status 0 1 FALSE 1 Com: 13432, Can: 0
Payment_Method 0 1 FALSE 5 Cre: 3899, Pay: 3863, Ban: 2259, Cas: 1727
Shipping_Type 0 1 FALSE 5 Sta: 4561, Ove: 2247, Exp: 2227, Exp: 2210

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 48.97 18.06 18.00 33.00 49.00 65.00 80.00 ▇▇▇▇▇
Rating 0 1 3.10 1.23 1.00 2.00 3.00 4.00 5.00 ▂▃▇▂▅
Total_Price 0 1 3173.74 2541.48 20.75 1083.54 2534.49 4639.60 11396.80 ▇▅▃▂▁
Unit_Price 0 1 577.88 312.83 20.75 361.18 463.96 791.19 1139.68 ▂▇▂▆▂
Quantity 0 1 5.48 2.87 1.00 3.00 5.00 8.00 10.00 ▇▇▇▇▇
Addon_Total 0 1 62.21 57.96 0.00 7.19 52.11 93.56 292.77 ▇▅▂▁▁

Feature Engineering

Creating Transaction-Level Features

New variables are created to capture additional transaction characteristics.

# Create Transaction ID
data_clean <- data_clean %>%
  mutate(Transaction_ID = row_number())

# Extract Month from Purchase Date
data_clean$Month <- month(data_clean$Purchase_Date)

# Installment Payment (1 = Credit Card / PayPal, else 0)
data_clean$Installment_Payment <- ifelse(
  data_clean$Payment_Method %in% c("Credit Card", "PayPal"), 1, 0
)

# Handling Multi-Value Categorical Variable (Add-ons Purchased)
data_clean <- data_clean %>%
  mutate(Has_Addon = ifelse(
    is.na(Addons_Purchased) |
    Addons_Purchased == "" |
    tolower(trimws(Addons_Purchased)) == "none",
    0,
    str_count(Addons_Purchased, ",") + 1
  ))

addons_df <- data_clean %>%
  separate_rows(Addons_Purchased, sep = ",") %>%
  mutate(Addons_Purchased = str_trim(Addons_Purchased)) %>%
  filter(
    !is.na(Addons_Purchased),
    Addons_Purchased != "",
    tolower(Addons_Purchased) != "none"
  ) %>%
  group_by(Transaction_ID, Addons_Purchased) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = Addons_Purchased,
    values_from = count,
    values_fill = 0
  )

addon_cols <- setdiff(colnames(addons_df), "Transaction_ID")
data_clean <- data_clean %>%
  left_join(addons_df, by = "Transaction_ID") %>%
  mutate(across(all_of(addon_cols), ~replace_na(., 0)))

data_clean <- data_clean %>%
  select(
    Transaction_ID,
    Customer_ID,
    Age,
    Gender,
    Loyalty_Member,
    Purchase_Date,
    Month,
    Product_Type,
    SKU,
    Order_Status,
    Shipping_Type,
    Payment_Method,
    Installment_Payment,
    Quantity,
    Unit_Price,
    Total_Price,
    Addon_Total,
    Rating,
    Has_Addon,
    `Impulse Item`,
    Accessory,
    `Extended Warranty`
  )

# Check new variables
knitr::kable(head(data_clean))
Transaction_ID Customer_ID Age Gender Loyalty_Member Purchase_Date Month Product_Type SKU Order_Status Shipping_Type Payment_Method Installment_Payment Quantity Unit_Price Total_Price Addon_Total Rating Has_Addon Impulse Item Accessory Extended Warranty
1 1000 53 Male No 2024-04-20 4 Tablet SKU1002 Completed Overnight PayPal 1 3 247.03 741.09 26.09 3 1 1 0 0
2 1002 41 Male No 2023-10-17 10 Laptop SKU1005 Completed Express Credit Card 1 4 463.96 1855.84 0.00 3 0 0 0 0
3 1002 41 Male Yes 2024-08-09 8 Smartphone SKU1004 Completed Overnight Cash 0 4 791.19 3164.76 60.16 2 2 2 0 0
4 1003 75 Male Yes 2024-05-21 5 Smartphone SKU1001 Completed Express Cash 0 2 20.75 41.50 35.56 5 1 0 1 0
5 1004 41 Female No 2024-05-26 5 Smartphone SKU1001 Completed Standard Credit Card 1 4 20.75 83.00 65.78 5 2 1 1 0
6 1005 25 Female No 2024-01-30 1 Smartwatch SKU1003 Completed Overnight PayPal 1 9 844.83 7603.47 0.00 3 0 0 0 0

Creating Customer-Level Dataset

Transaction data is aggregated to the customer level to capture purchasing behaviour.

customer_data <- data_clean %>%
  group_by(Customer_ID) %>%
  summarise(
    Age = first(Age),
    Gender = first(Gender),
    Ever_Loyal = max(ifelse(Loyalty_Member == "Yes", 1, 0)),
    Total_Transactions = n(),
    Total_Spend = sum(Total_Price, na.rm = TRUE),
    Avg_Spend = mean(Total_Price, na.rm = TRUE),
    Total_Quantity = sum(Quantity, na.rm = TRUE),
    Has_Addon = sum(Has_Addon, na.rm = TRUE),
    `Impulse Item` = sum(`Impulse Item`, na.rm = TRUE),
    Accessory = sum(Accessory, na.rm = TRUE),
    `Extended Warranty` = sum(`Extended Warranty`, na.rm = TRUE),
    
    .groups = "drop"
  )

# View dataset
knitr::kable(head(customer_data))
Customer_ID Age Gender Ever_Loyal Total_Transactions Total_Spend Avg_Spend Total_Quantity Has_Addon Impulse Item Accessory Extended Warranty
1000 53 Male 0 1 741.09 741.090 3 1 1 0 0
1002 41 Male 1 2 5020.60 2510.300 8 2 2 0 0
1003 75 Male 1 1 41.50 41.500 2 1 0 1 0
1004 41 Female 0 1 83.00 83.000 4 2 1 1 0
1005 25 Female 0 2 11779.11 5889.555 18 2 0 0 2
1006 24 Male 1 2 6645.94 3322.970 19 3 2 1 0

Exploratory Data Analysis (EDA)

Variables Distribution in Table

The distribution of variables is summarised to identify patterns and variability.

eda_table <- function(df, var) {
  df %>%
    group_by({{ var }}) %>%
    summarise(
      Count = n(),
      Percentage = n() / nrow(df),
      Avg_Value = mean(Total_Price, na.rm = TRUE),
      Total_Value = sum(Total_Price, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(desc(Count))
}
knitr::kable(head(eda_table(data_clean, Age)))
Age Count Percentage Avg_Value Total_Value
61 259 0.0192823 3183.235 824458.0
40 253 0.0188356 2903.359 734549.8
28 246 0.0183145 2996.149 737052.8
67 244 0.0181656 3006.334 733545.5
50 238 0.0177189 3253.173 774255.1
37 236 0.0175700 3043.013 718151.0
knitr::kable(eda_table(data_clean, Gender))
Gender Count Percentage Avg_Value Total_Value
Male 6780 0.5047647 3152.216 21372025.22
Female 6651 0.4951608 3196.048 21256916.03
NA 1 0.0000744 674.320 674.32
knitr::kable(eda_table(data_clean, Month))
Month Count Percentage Avg_Value Total_Value
1 1399 0.1041543 3228.219 4516278
5 1371 0.1020697 3255.226 4462915
7 1356 0.1009529 3289.784 4460946
6 1349 0.1004318 3310.597 4465995
3 1323 0.0984961 3194.351 4226127
8 1323 0.0984961 3309.473 4378433
4 1284 0.0955926 3349.162 4300324
2 1227 0.0913490 3176.243 3897250
9 1125 0.0837552 3258.562 3665883
10 613 0.0456373 2537.236 1555326
11 537 0.0399792 2588.671 1390116
12 525 0.0390858 2495.282 1310023
knitr::kable(eda_table(data_clean, Product_Type))
Product_Type Count Percentage Avg_Value Total_Value
Smartphone 4004 0.2980941 3598.361 14407836
Tablet 2745 0.2043627 2813.345 7722632
Laptop 2686 0.1999702 3114.633 8365905
Smartwatch 2636 0.1962478 3565.475 9398591
Headphones 1361 0.1013252 2009.295 2734651
knitr::kable(eda_table(data_clean, SKU))
SKU Count Percentage Avg_Value Total_Value
SKU1002 1382 0.1028886 1360.6312 1880392.4
TBL345 1363 0.1014741 4286.3095 5842239.9
HDP456 1360 0.1012507 2007.7359 2730520.8
SKU1005 1352 0.1006552 2442.1142 3301738.4
SMP234 1346 0.1002085 6254.6925 8418816.2
LTP123 1334 0.0993151 3796.7148 5064817.5
SKU1001 1329 0.0989428 112.0875 148964.2
SKU1004 1329 0.0989428 4393.8335 5839404.7
SWT567 1325 0.0986450 2546.4970 3374108.5
SKU1003 1312 0.0976772 4594.9794 6028612.9
knitr::kable(eda_table(data_clean, Shipping_Type))
Shipping_Type Count Percentage Avg_Value Total_Value
Standard 4561 0.3395622 3154.360 14387038
Overnight 2247 0.1672871 2620.259 5887723
Express 2227 0.1657981 2529.005 5632093
Expedited 2210 0.1645325 3814.654 8430386
Same Day 2187 0.1628201 3791.667 8292376
knitr::kable(head(eda_table(data_clean, Quantity)))
Quantity Count Percentage Avg_Value Total_Value
2 1369 0.1019208 1150.322 1574791
3 1367 0.1017719 1719.858 2351047
8 1363 0.1014741 4631.195 6312319
9 1344 0.1000596 5236.902 7038397
4 1343 0.0999851 2308.096 3099773
6 1343 0.0999851 3517.713 4724289
knitr::kable(eda_table(data_clean, Rating))
Rating Count Percentage Avg_Value Total_Value
3 5339 0.3974836 3034.852 16203077
5 2677 0.1993002 1981.182 5303625
2 2642 0.1966945 4077.199 10771960
1 1392 0.1036331 3751.574 5222191
4 1382 0.1028886 3711.116 5128763
knitr::kable(eda_table(data_clean, Has_Addon))
Has_Addon Count Percentage Avg_Value Total_Value
2 3419 0.2545414 3110.484 10634744
3 3375 0.2512656 3185.780 10752008
1 3351 0.2494789 3201.507 10728250
0 3287 0.2447141 3198.848 10514615

Variables Distribution in Bar Charts

The distribution of variables is visualized to identify patterns and variability.

# Function to create clean bar chart
plot_bar <- function(df, var, title_name) {
  df %>%
    filter(!is.na({{ var }})) %>%
    ggplot(aes(x = as.factor({{ var }}))) +
    geom_bar(fill = "navyblue") +
    labs(
      title = title_name,
      x = NULL,
      y = NULL
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 12, face = "bold"),
      axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
      axis.text.y = element_text(size = 8)
    )
}

# Create plots
p1 <- ggplot(data_clean, aes(x = Age)) +
  geom_histogram(fill = "navyblue", bins = 20, color = "white", linedwidth = 0.2) +
  scale_x_continuous(breaks = seq(0, 100, by = 10), limits = c(15, 85)) +
  labs(title = "Age", x = NULL, y = NULL) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8)
  )
p2 <- plot_bar(data_clean, Gender, "Gender")
p3 <- plot_bar(data_clean, Month, "Month")
p4 <- plot_bar(data_clean, Product_Type, "Product Type")
p5 <- plot_bar(data_clean, SKU, "SKU")
p6 <- plot_bar(data_clean, Shipping_Type, "Shipping Type")
p7 <- plot_bar(data_clean, Quantity, "Quantity")
p8 <- plot_bar(data_clean, Rating, "Rating")
p9 <- plot_bar(data_clean, Has_Addon, "Has Addon")

# Arrange in 3x3 grid
grid.arrange(p1, p2, p3,
             p4, p5, p6,
             p7, p8, p9,
             ncol = 3)

Product Type and SKU Allocation

Displays the relationship between product types and SKUs

sku_table <- data_clean %>%
  select(Product_Type, SKU) %>%
  distinct() %>%
  arrange(Product_Type, SKU)

knitr::kable(sku_table)
Product_Type SKU
Headphones HDP456
Headphones SKU1003
Laptop LTP123
Laptop SKU1004
Laptop SKU1005
Smartphone SKU1001
Smartphone SKU1004
Smartphone SKU1005
Smartphone SMP234
Smartwatch SKU1003
Smartwatch SWT567
Tablet SKU1002
Tablet TBL345

Time Series Analysis of Monthly Spending

Shows how spending changes across months, highlighting overall fluctuations over time

# Prepare monthly data
monthly_df <- data_clean %>%
  group_by(Month) %>%
  summarise(
    Total_Spend = sum(Total_Price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(Month)

# Convert Month (numeric) to Jan–Dec
monthly_df$Month <- factor(monthly_df$Month,
                           levels = 1:12,
                           labels = c("Jan","Feb","Mar","Apr","May","Jun",
                                      "Jul","Aug","Sep","Oct","Nov","Dec"))

# Plot
p <- ggplot(monthly_df, aes(x = Month, y = Total_Spend, group = 1)) +
  geom_line(color = "navyblue", linewidth = 0.9) +
  geom_point(color = "navyblue", size = 2.5) +
  
  geom_smooth(method = "lm", se = FALSE,
              color = "red",
              linetype = "dashed",
              linewidth = 0.8) +
  
  labs(
    title = "Monthly Total Spend",
    x = "Month",
    y = "Total Spend"
  ) +
  
  scale_y_continuous(labels = comma) +
  
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  )

print(p)

Research Question 1: What factors drive revenue in electronics retail transactions?

Data Preparation for Modelling (Transaction-Level)

A transaction-level dataset is prepared for modelling.

model_df <- data_clean %>%
  select(
    Total_Price,
    Product_Type,
    Has_Addon,
    `Impulse Item`,
    Accessory,
    `Extended Warranty`,
    Installment_Payment,
    Payment_Method,
    Loyalty_Member,
    Age,
    Gender,
    Shipping_Type
  ) %>%
  na.omit()

model_df <- model_df %>%
  rename(
    Impulse_Item = `Impulse Item`,
    Extended_Warranty = `Extended Warranty`
  ) %>%
  mutate(
    across(c(Product_Type, Has_Addon, Installment_Payment,
             Payment_Method, Loyalty_Member, Gender,
             Shipping_Type),
           as.factor)
  )

Linear Regression

A linear regression model is fitted to identify significant predictors of total price.

model_lm <- lm(Total_Price ~ ., data = model_df)
knitr::kable(tidy(model_lm))
term estimate std.error statistic p.value
(Intercept) 2532.9483495 110.547949 22.9126671 0.0000000
Product_TypeLaptop 1741.8436603 82.088398 21.2191211 0.0000000
Product_TypeSmartphone 2407.8120918 79.228278 30.3908169 0.0000000
Product_TypeSmartwatch 2182.1868738 82.267429 26.5255264 0.0000000
Product_TypeTablet 1432.5368643 81.744659 17.5245317 0.0000000
Has_Addon1 -23.9619200 63.405396 -0.3779161 0.7054989
Has_Addon2 -80.4262948 75.909889 -1.0594969 0.2893926
Has_Addon3 -5.6159336 92.603866 -0.0606447 0.9516431
Impulse_Item -10.1411420 41.208594 -0.2460929 0.8056141
Accessory 30.4777352 41.053373 0.7423930 0.4578623
Extended_Warranty NA NA NA NA
Installment_Payment1 -288.8079095 65.834027 -4.3869094 0.0000116
Payment_MethodCash -760.0698786 86.799571 -8.7566087 0.0000000
Payment_MethodCredit Card -67.9405732 54.442797 -1.2479258 0.2120800
Payment_MethodDebit Card -582.1749068 87.284872 -6.6698259 0.0000000
Payment_MethodPayPal NA NA NA NA
Loyalty_MemberYes -50.7459147 50.180250 -1.0112726 0.3119042
Age 0.4062262 1.145615 0.3545924 0.7229006
GenderMale -45.1755820 41.398827 -1.0912285 0.2751919
Shipping_TypeExpress -1509.1345681 79.742130 -18.9251850 0.0000000
Shipping_TypeOvernight -1400.7894690 79.955822 -17.5195430 0.0000000
Shipping_TypeSame Day -52.6481736 72.337531 -0.7278127 0.4667410
Shipping_TypeStandard -776.9077880 64.460805 -12.0524059 0.0000000

Random Forest

A random forest model is fitted to identify important features driving total price.

set.seed(123)
model_rf <- randomForest(
  Total_Price ~ .,
  data = model_df,
  importance = TRUE
)

importance_df <- as.data.frame(importance(model_rf))
importance_df$Variable <- rownames(importance_df)

ggplot(importance_df, aes(x = reorder(Variable, `%IncMSE`), y = `%IncMSE`)) +
  geom_col(aes(fill = `%IncMSE` < 25)) +
  scale_fill_manual(
    values = c("TRUE" = "darkred", "FALSE" = "navyblue"),
    guide = "none"
  ) +
  coord_flip() +
  labs(
    title = "Feature Importance (Random Forest)",
    subtitle = "Low-importance variables highlighted in red",
    x = NULL,
    y = "% Increase in MSE"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11),
    axis.text = element_text(size = 10)
  )

Research Question 2: Can customers be segmented into meaningful groups?

Data Preparation for Modelling (Customer-Level)

A Customer-level dataset is prepared for modelling.

clustering_df <- customer_data %>%
  select(
    Age,
    Ever_Loyal,
    Total_Transactions,
    Total_Spend,
    Avg_Spend,
    Total_Quantity,
    Has_Addon,
    `Impulse Item`,
    Accessory,
    `Extended Warranty`
  )

clustering_scaled <- scale(clustering_df)

Find Optimal Clusters with Elbow Method

The elbow method is used to determine the optimal number of clusters.

fviz_nbclust(clustering_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method for Optimal Clusters")

K-Means Clustering

K-means clustering is performed to segment customers into groups based on their purchasing behaviour.

set.seed(123)

kmeans_model <- kmeans(clustering_scaled, centers = 3, nstart = 25)

fviz_cluster(kmeans_model, data = clustering_scaled)

clustering_df$Cluster <- as.factor(kmeans_model$cluster)

cluster_profiling <- clustering_df %>%
  group_by(Cluster) %>%
  summarise(
    Avg_Age = mean(Age),
    Avg_Loyalty = mean(Ever_Loyal),
    Avg_Spend = mean(Total_Spend),
    Avg_Transactions = mean(Total_Transactions),
    Avg_Quantity = mean(Total_Quantity),
    Avg_Addon = mean(Has_Addon)
  )

knitr::kable(cluster_profiling)
Cluster Avg_Age Avg_Loyalty Avg_Spend Avg_Transactions Avg_Quantity Avg_Addon
1 48.38653 0.4834197 8218.609 2.435751 14.007772 4.435233
2 49.61802 0.2372881 7054.542 1.248863 9.212071 1.489872
3 49.26129 0.2282587 1896.182 1.115888 4.754935 1.592730

Research Question 3: How do payment methods relate to purchasing behaviour?

Data Preparation

Payment method and installment payment summaries are created.

payment_summary <- data_clean %>%
  mutate(
    Payment_Group = ifelse(
      Payment_Method %in% c("PayPal", "Credit Card"),
      Payment_Method,
      "Other"
    )
  ) %>%
  group_by(Payment_Method, Payment_Group) %>%
  summarise(
    Transaction_Count = n(),
    .groups = "drop"
  )

installment_summary <- data_clean %>%
  mutate(Installment_Label = ifelse(Installment_Payment == 1, "Yes", "No")) %>%
  group_by(Installment_Label) %>%
  summarise(
    Transaction_Count = n(),
    .groups = "drop"
  )

Transaction by Payment Method Plot

This plot compares the transaction counts for different payment methods.

ggplot(payment_summary, aes(x = Payment_Method, y = Transaction_Count)) +
  geom_col(aes(fill = Payment_Method %in% c("PayPal", "Credit Card"))) +
  scale_fill_manual(
    values = c("TRUE" = "navyblue", "FALSE" = "darkred"),
    guide = "none"
  ) +
  labs(
    title = "Transaction by Payment Method",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    axis.text = element_text(size = 13, angle = 0, hjust = 0.5)
    
  )

Transaction by Instalment Payment Plot

This plot compares the transaction counts for customers who used installment payment versus those who did not.

ggplot(installment_summary, aes(x = Installment_Label, y = Transaction_Count)) +
  geom_col(aes(fill = Installment_Label)) +
  scale_fill_manual(
    values = c("Yes" = "navyblue", "No" = "darkred"),
    guide = "none"
  ) +
  labs(
    title = "Transaction by Instalment Payment",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    axis.text = element_text(size = 13, angle = 0, hjust = 0.5)
  )

Payment Behavior Analysis

This table summarizes the average spend and quantity for customers based on whether they used installment payment or not

payment_behavior <- data_clean %>%
  group_by(Installment_Payment) %>%
  summarise(
    Avg_Spend = round(mean(Total_Price), 2),
    Avg_Quantity = round(mean(Quantity), 2),
    .groups = "drop"
  )

knitr::kable(payment_behavior)
Installment_Payment Avg_Spend Avg_Quantity
0 3066.85 5.50
1 3251.81 5.47